diff options
Diffstat (limited to 'kernel/drivers/infiniband/hw/mlx5')
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/Kconfig | 10 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/Makefile | 4 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/ah.c | 92 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/cq.c | 1189 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/doorbell.c | 98 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/mad.c | 139 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/main.c | 1397 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/mem.c | 225 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/mlx5_ib.h | 669 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/mr.c | 1479 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/odp.c | 798 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/qp.c | 3174 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/srq.c | 485 | ||||
-rw-r--r-- | kernel/drivers/infiniband/hw/mlx5/user.h | 133 |
14 files changed, 9892 insertions, 0 deletions
diff --git a/kernel/drivers/infiniband/hw/mlx5/Kconfig b/kernel/drivers/infiniband/hw/mlx5/Kconfig new file mode 100644 index 000000000..10df386c6 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/Kconfig @@ -0,0 +1,10 @@ +config MLX5_INFINIBAND + tristate "Mellanox Connect-IB HCA support" + depends on NETDEVICES && ETHERNET && PCI + select NET_VENDOR_MELLANOX + select MLX5_CORE + ---help--- + This driver provides low-level InfiniBand support for + Mellanox Connect-IB PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. diff --git a/kernel/drivers/infiniband/hw/mlx5/Makefile b/kernel/drivers/infiniband/hw/mlx5/Makefile new file mode 100644 index 000000000..27a70159e --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o + +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o +mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/kernel/drivers/infiniband/hw/mlx5/ah.c b/kernel/drivers/infiniband/hw/mlx5/ah.c new file mode 100644 index 000000000..66080580e --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/ah.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx5_ib.h" + +struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah) +{ + if (ah_attr->ah_flags & IB_AH_GRH) { + memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16); + ah->av.grh_gid_fl = cpu_to_be32(ah_attr->grh.flow_label | + (1 << 30) | + ah_attr->grh.sgid_index << 20); + ah->av.hop_limit = ah_attr->grh.hop_limit; + ah->av.tclass = ah_attr->grh.traffic_class; + } + + ah->av.rlid = cpu_to_be16(ah_attr->dlid); + ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; + ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf); + + return &ah->ibah; +} + +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah; + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + return create_ib_ah(ah_attr, ah); /* never fails */ +} + +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah = to_mah(ibah); + u32 tmp; + + memset(ah_attr, 0, sizeof(*ah_attr)); + + tmp = be32_to_cpu(ah->av.grh_gid_fl); + if (tmp & (1 << 30)) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.sgid_index = (tmp >> 20) & 0xff; + ah_attr->grh.flow_label = tmp & 0xfffff; + memcpy(&ah_attr->grh.dgid, ah->av.rgid, 16); + ah_attr->grh.hop_limit = ah->av.hop_limit; + ah_attr->grh.traffic_class = ah->av.tclass; + } + ah_attr->dlid = be16_to_cpu(ah->av.rlid); + ah_attr->static_rate = ah->av.stat_rate_sl >> 4; + ah_attr->sl = ah->av.stat_rate_sl & 0xf; + + return 0; +} + +int mlx5_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/cq.c b/kernel/drivers/infiniband/hw/mlx5/cq.c new file mode 100644 index 000000000..2ee6b1051 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/cq.c @@ -0,0 +1,1189 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kref.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> +#include "mlx5_ib.h" +#include "user.h" + +static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) +{ + struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq); + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct ib_cq *ibcq = &cq->ibcq; + struct ib_event event; + + if (type != MLX5_EVENT_TYPE_CQ_ERROR) { + mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n", + type, mcq->cqn); + return; + } + + if (ibcq->event_handler) { + event.device = &dev->ib_dev; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe_from_buf(struct mlx5_ib_cq_buf *buf, int n, int size) +{ + return mlx5_buf_offset(&buf->buf, n * size); +} + +static void *get_cqe(struct mlx5_ib_cq *cq, int n) +{ + return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz); +} + +static u8 sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 1 : 0; +} + +static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } +} + +static void *next_cqe_sw(struct mlx5_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx) +{ + switch (wq->wr_data[idx]) { + case MLX5_IB_WR_UMR: + return 0; + + case IB_WR_LOCAL_INV: + return IB_WC_LOCAL_INV; + + case IB_WR_FAST_REG_MR: + return IB_WC_FAST_REG_MR; + + default: + pr_warn("unknown completion status\n"); + return 0; + } +} + +static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_wq *wq, int idx) +{ + wc->wc_flags = 0; + switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX5_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX5_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX5_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + case MLX5_OPCODE_UMR: + wc->opcode = get_umr_comp(wq, idx); + break; + } +} + +enum { + MLX5_GRH_IN_BUFFER = 1, + MLX5_GRH_IN_CQE = 2, +}; + +static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_ib_srq *srq; + struct mlx5_ib_wq *wq; + u16 wqe_ctr; + u8 g; + + if (qp->ibqp.srq || qp->ibqp.xrcd) { + struct mlx5_core_srq *msrq = NULL; + + if (qp->ibqp.xrcd) { + msrq = mlx5_core_get_srq(dev->mdev, + be32_to_cpu(cqe->srqn)); + srq = to_mibsrq(msrq); + } else { + srq = to_msrq(qp->ibqp.srq); + } + if (srq) { + wqe_ctr = be16_to_cpu(cqe->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + if (msrq && atomic_dec_and_test(&msrq->refcount)) + complete(&msrq->free); + } + } else { + wq = &qp->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->op_own >> 4) { + case MLX5_CQE_RESP_WR_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX5_CQE_RESP_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND_INV: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey); + break; + } + wc->slid = be16_to_cpu(cqe->slid); + wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf; + wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff; + wc->dlid_path_bits = cqe->ml_path; + g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; + wc->wc_flags |= g ? IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; +} + +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +{ + __be32 *p = (__be32 *)cqe; + int i; + + mlx5_ib_warn(dev, "dump error cqe\n"); + for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4) + pr_info("%08x %08x %08x %08x\n", be32_to_cpu(p[0]), + be32_to_cpu(p[1]), be32_to_cpu(p[2]), + be32_to_cpu(p[3])); +} + +static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, + struct mlx5_err_cqe *cqe, + struct ib_wc *wc) +{ + int dump = 1; + + switch (cqe->syndrome) { + case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: + dump = 0; + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX5_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX5_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_synd; + if (dump) + dump_cqe(dev, cqe); +} + +static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx) +{ + /* TBD: waiting decision + */ + return 0; +} + +static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx) +{ + struct mlx5_wqe_data_seg *dpseg; + void *addr; + + dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr); + return addr; +} + +static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + uint16_t idx) +{ + void *addr; + int byte_count; + int i; + + if (!is_atomic_response(qp, idx)) + return; + + byte_count = be32_to_cpu(cqe64->byte_cnt); + addr = mlx5_get_atomic_laddr(qp, idx); + + if (byte_count == 4) { + *(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr)); + } else { + for (i = 0; i < byte_count; i += 8) { + *(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr)); + addr += 8; + } + } + + return; +} + +static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 tail, u16 head) +{ + u16 idx; + + do { + idx = tail & (qp->sq.wqe_cnt - 1); + handle_atomic(qp, cqe64, idx); + if (idx == head) + break; + + tail = qp->sq.w_list[idx].next; + } while (1); + tail = qp->sq.w_list[idx].next; + qp->sq.last_poll = tail; +} + +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_buf_free(dev->mdev, &buf->buf); +} + +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, + struct ib_sig_err *item) +{ + u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR (1 << 13) +#define APPTAG_ERR (1 << 12) +#define REFTAG_ERR (1 << 11) + + if (syndrome & GUARD_ERR) { + item->err_type = IB_SIG_BAD_GUARD; + item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16; + item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; + } else + if (syndrome & REFTAG_ERR) { + item->err_type = IB_SIG_BAD_REFTAG; + item->expected = be32_to_cpu(cqe->expected_reftag); + item->actual = be32_to_cpu(cqe->actual_reftag); + } else + if (syndrome & APPTAG_ERR) { + item->err_type = IB_SIG_BAD_APPTAG; + item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; + item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; + } else { + pr_err("Got signature completion error with bad syndrome %04x\n", + syndrome); + } + + item->sig_err_offset = be64_to_cpu(cqe->err_offset); + item->key = be32_to_cpu(cqe->mkey); +} + +static int mlx5_poll_one(struct mlx5_ib_cq *cq, + struct mlx5_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_err_cqe *err_cqe; + struct mlx5_cqe64 *cqe64; + struct mlx5_core_qp *mqp; + struct mlx5_ib_wq *wq; + struct mlx5_sig_err_cqe *sig_err_cqe; + struct mlx5_core_mr *mmr; + struct mlx5_ib_mr *mr; + uint8_t opcode; + uint32_t qpn; + u16 wqe_ctr; + void *cqe; + int idx; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + ++cq->mcq.cons_index; + + /* Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + opcode = cqe64->op_own >> 4; + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { + if (likely(cq->resize_buf)) { + free_cq_buf(dev, &cq->buf); + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + goto repoll; + } else { + mlx5_ib_warn(dev, "unexpected resize cqe\n"); + } + } + + qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; + if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + /* We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + mqp = __mlx5_qp_lookup(dev->mdev, qpn); + if (unlikely(!mqp)) { + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n", + cq->mcq.cqn, qpn); + return -EINVAL; + } + + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + switch (opcode) { + case MLX5_CQE_REQ: + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + handle_good_req(wc, cqe64, wq, idx); + handle_atomics(*cur_qp, cqe64, wq->last_poll, idx); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESP_WR_IMM: + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + handle_responder(wc, cqe64, *cur_qp); + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESIZE_CQ: + break; + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + err_cqe = (struct mlx5_err_cqe *)cqe64; + mlx5_handle_error_cqe(dev, err_cqe, wc); + mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n", + opcode == MLX5_CQE_REQ_ERR ? + "Requestor" : "Responder", cq->mcq.cqn); + mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", + err_cqe->syndrome, err_cqe->vendor_err_synd); + if (opcode == MLX5_CQE_REQ_ERR) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + } else { + struct mlx5_ib_srq *srq; + + if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + } + break; + case MLX5_CQE_SIG_ERR: + sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + + read_lock(&dev->mdev->priv.mr_table.lock); + mmr = __mlx5_mr_lookup(dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + if (unlikely(!mmr)) { + read_unlock(&dev->mdev->priv.mr_table.lock); + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", + cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); + return -EINVAL; + } + + mr = to_mibmr(mmr); + get_sig_err_item(sig_err_cqe, &mr->sig->err_item); + mr->sig->sig_err_exists = true; + mr->sig->sigerr_count++; + + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", + cq->mcq.cqn, mr->sig->err_item.key, + mr->sig->err_item.err_type, + mr->sig->err_item.sig_err_offset, + mr->sig->err_item.expected, + mr->sig->err_item.actual); + + read_unlock(&dev->mdev->priv.mr_table.lock); + goto repoll; + } + + return 0; +} + +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_ib_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + int err = 0; + + spin_lock_irqsave(&cq->lock, flags); + + for (npolled = 0; npolled < num_entries; npolled++) { + err = mlx5_poll_one(cq, &cur_qp, wc + npolled); + if (err) + break; + } + + if (npolled) + mlx5_cq_set_ci(&cq->mcq); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (err == 0 || err == -EAGAIN) + return npolled; + else + return err; +} + +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + void __iomem *uar_page = mdev->priv.uuari.uars[0].map; + + mlx5_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, + uar_page, + MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock), + to_mcq(ibcq)->mcq.cons_index); + + return 0; +} + +static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, + int nent, int cqe_size) +{ + int err; + + err = mlx5_buf_alloc(dev->mdev, nent * cqe_size, + PAGE_SIZE * 2, &buf->buf); + if (err) + return err; + + buf->cqe_size = cqe_size; + buf->nent = nent; + + return 0; +} + +static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, + struct ib_ucontext *context, struct mlx5_ib_cq *cq, + int entries, struct mlx5_create_cq_mbox_in **cqb, + int *cqe_size, int *index, int *inlen) +{ + struct mlx5_ib_create_cq ucmd; + size_t ucmdlen; + int page_shift; + int npages; + int ncont; + int err; + + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) + return -EFAULT; + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) + return -EINVAL; + + *cqe_size = ucmd.cqe_size; + + cq->buf.umem = ib_umem_get(context, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->buf.umem)) { + err = PTR_ERR(cq->buf.umem); + return err; + } + + err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_umem; + + mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n", + ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_db; + } + mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); + (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + *index = to_mucontext(context)->uuari.uars[0].index; + + return 0; + +err_db: + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_umem: + ib_umem_release(cq->buf.umem); + return err; +} + +static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) +{ + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + ib_umem_release(cq->buf.umem); +} + +static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf) +{ + int i; + void *cqe; + struct mlx5_cqe64 *cqe64; + + for (i = 0; i < buf->nent; i++) { + cqe = get_cqe_from_buf(buf, i, buf->cqe_size); + cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size, + struct mlx5_create_cq_mbox_in **cqb, + int *index, int *inlen) +{ + int err; + + err = mlx5_db_alloc(dev->mdev, &cq->db); + if (err) + return err; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + cq->mcq.cqe_sz = cqe_size; + + err = alloc_cq_buf(dev, &cq->buf, entries, cqe_size); + if (err) + goto err_db; + + init_cq_buf(cq, &cq->buf); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); + + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + *index = dev->mdev->priv.uuari.uars[0].index; + + return 0; + +err_buf: + free_cq_buf(dev, &cq->buf); + +err_db: + mlx5_db_free(dev->mdev, &cq->db); + return err; +} + +static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, &cq->buf); + mlx5_db_free(dev->mdev, &cq->db); +} + +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_create_cq_mbox_in *cqb = NULL; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_cq *cq; + int uninitialized_var(index); + int uninitialized_var(inlen); + int cqe_size; + int irqn; + int eqn; + int err; + + if (entries < 0) + return ERR_PTR(-EINVAL); + + entries = roundup_pow_of_two(entries + 1); + if (entries > dev->mdev->caps.gen.max_cqes) + return ERR_PTR(-EINVAL); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + + if (context) { + err = create_cq_user(dev, udata, context, cq, entries, + &cqb, &cqe_size, &index, &inlen); + if (err) + goto err_create; + } else { + /* for now choose 64 bytes till we have a proper interface */ + cqe_size = 64; + err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, + &index, &inlen); + if (err) + goto err_create; + } + + cq->cqe_size = cqe_size; + cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); + err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); + if (err) + goto err_cqb; + + cqb->ctx.c_eqn = cpu_to_be16(eqn); + cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma); + + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); + if (err) + goto err_cqb; + + mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); + cq->mcq.irqn = irqn; + cq->mcq.comp = mlx5_ib_cq_comp; + cq->mcq.event = mlx5_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { + err = -EFAULT; + goto err_cmd; + } + + + kvfree(cqb); + return &cq->ibcq; + +err_cmd: + mlx5_core_destroy_cq(dev->mdev, &cq->mcq); + +err_cqb: + kvfree(cqb); + if (context) + destroy_cq_user(cq, context); + else + destroy_cq_kernel(dev, cq); + +err_create: + kfree(cq); + + return ERR_PTR(err); +} + + +int mlx5_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + struct ib_ucontext *context = NULL; + + if (cq->uobject) + context = cq->uobject->context; + + mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); + if (context) + destroy_cq_user(mcq, context); + else + destroy_cq_kernel(dev, mcq); + + kfree(mcq); + + return 0; +} + +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) +{ + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); +} + +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) +{ + struct mlx5_cqe64 *cqe64, *dest64; + void *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + if (!cq) + return; + + /* First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) + mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64; + owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; + memcpy(dest, cqe, cq->mcq.cqe_sz); + dest64->op_own = owner_bit | + (dest64->op_own & ~MLX5_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + mlx5_cq_set_ci(&cq->mcq); + } +} + +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) +{ + if (!cq) + return; + + spin_lock_irq(&cq->lock); + __mlx5_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} + +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx5_modify_cq_mbox_in *in; + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + int err; + u32 fsel; + + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_CQ_MODER)) + return -ENOSYS; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->cqn = cpu_to_be32(mcq->mcq.cqn); + fsel = (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); + in->ctx.cq_period = cpu_to_be16(cq_period); + in->ctx.cq_max_count = cpu_to_be16(cq_count); + in->field_select = cpu_to_be32(fsel); + err = mlx5_core_modify_cq(dev->mdev, &mcq->mcq, in, sizeof(*in)); + kfree(in); + + if (err) + mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + + return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, int *npas, + int *page_shift, int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + int npages; + struct ib_ucontext *context = cq->buf.umem->context; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift, + npas, NULL); + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + +static void un_resize_user(struct mlx5_ib_cq *cq) +{ + ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_buf(cq, cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = get_cqe_from_buf(cq->resize_buf, + (i + 1) & (cq->resize_buf->nent), + dsize); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", + cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; +} + +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_modify_cq_mbox_in *in; + int err; + int npas; + int page_shift; + int inlen; + int uninitialized_var(cqe_size); + unsigned long flags; + + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { + pr_info("Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1) + return -EINVAL; + + entries = roundup_pow_of_two(entries + 1); + if (entries > dev->mdev->caps.gen.max_cqes + 1) + return -EINVAL; + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, + &cqe_size); + } else { + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (!err) { + npas = cq->resize_buf->buf.npages; + page_shift = cq->resize_buf->buf.page_shift; + } + } + + if (err) + goto ex; + + inlen = sizeof(*in) + npas * sizeof(in->pas[0]); + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + if (udata) + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, + in->pas, 0); + else + mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); + + in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + in->ctx.page_offset = 0; + in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); + in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); + in->cqn = cpu_to_be32(cq->mcq.cqn); + + err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen); + if (err) + goto ex_alloc; + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + spin_unlock_irqrestore(&cq->lock, flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + kvfree(in); + return 0; + +ex_alloc: + kvfree(in); + +ex_resize: + if (udata) + un_resize_user(cq); + else + un_resize_kernel(dev, cq); +ex: + mutex_unlock(&cq->resize_mutex); + return err; +} + +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) +{ + struct mlx5_ib_cq *cq; + + if (!ibcq) + return 128; + + cq = to_mcq(ibcq); + return cq->cqe_size; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/doorbell.c b/kernel/drivers/infiniband/hw/mlx5/doorbell.c new file mode 100644 index 000000000..a0e4e6ddb --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/doorbell.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kref.h> +#include <linux/slab.h> +#include <rdma/ib_umem.h> + +#include "mlx5_ib.h" + +struct mlx5_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; +}; + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db) +{ + struct mlx5_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/kernel/drivers/infiniband/hw/mlx5/mad.c b/kernel/drivers/infiniband/hw/mlx5/mad.c new file mode 100644 index 000000000..9cf9a37bb --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/mad.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mlx5/cmd.h> +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> +#include "mlx5_ib.h" + +enum { + MLX5_IB_VENDOR_CLASS1 = 0x9, + MLX5_IB_VENDOR_CLASS2 = 0xa +}; + +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + u8 op_modifier = 0; + + /* Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port); +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid; + int err; + + slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else { + return IB_MAD_RESULT_SUCCESS; + } + + err = mlx5_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + u16 packet_error; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + + packet_error = be16_to_cpu(out_mad->status); + + dev->mdev->caps.gen.ext_port_cap[port - 1] = (!err && !packet_error) ? + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/main.c b/kernel/drivers/infiniband/hw/mlx5/main.c new file mode 100644 index 000000000..57c9809e8 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/main.c @@ -0,0 +1,1397 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <asm-generic/kmap_types.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/slab.h> +#include <linux/io-mapping.h> +#include <linux/sched.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_umem.h> +#include "user.h" +#include "mlx5_ib.h" + +#define DRIVER_NAME "mlx5_ib" +#define DRIVER_VERSION "2.2-1" +#define DRIVER_RELDATE "Feb 2014" + +MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>"); +MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRIVER_VERSION); + +static int deprecated_prof_sel = 2; +module_param_named(prof_sel, deprecated_prof_sel, int, 0444); +MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core"); + +static char mlx5_version[] = + DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" + DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; + +static int mlx5_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + struct mlx5_general_caps *gen; + int err = -ENOMEM; + int max_rq_sg; + int max_sq_sg; + u64 flags; + + gen = &dev->mdev->caps.gen; + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memset(props, 0, sizeof(*props)); + + props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | + (fw_rev_min(dev->mdev) << 16) | + fw_rev_sub(dev->mdev); + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + flags = gen->flags; + if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (flags & MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (flags & MLX5_DEV_CAP_FLAG_APM) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if (flags & MLX5_DEV_CAP_FLAG_XRC) + props->device_cap_flags |= IB_DEVICE_XRC; + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (flags & MLX5_DEV_CAP_FLAG_SIG_HAND_OVER) { + props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + /* At this stage no support for signature handover */ + props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | + IB_PROT_T10DIF_TYPE_2 | + IB_PROT_T10DIF_TYPE_3; + props->sig_guard_cap = IB_GUARD_T10DIF_CRC | + IB_GUARD_T10DIF_CSUM; + } + if (flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST) + props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + props->vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = be16_to_cpup((__be16 *)(out_mad->data + 30)); + props->hw_ver = be32_to_cpup((__be32 *)(out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = gen->min_page_sz; + props->max_qp = 1 << gen->log_max_qp; + props->max_qp_wr = gen->max_wqes; + max_rq_sg = gen->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg); + max_sq_sg = (gen->max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) / + sizeof(struct mlx5_wqe_data_seg); + props->max_sge = min(max_rq_sg, max_sq_sg); + props->max_cq = 1 << gen->log_max_cq; + props->max_cqe = gen->max_cqes - 1; + props->max_mr = 1 << gen->log_max_mkey; + props->max_pd = 1 << gen->log_max_pd; + props->max_qp_rd_atom = 1 << gen->log_max_ra_req_qp; + props->max_qp_init_rd_atom = 1 << gen->log_max_ra_res_qp; + props->max_srq = 1 << gen->log_max_srq; + props->max_srq_wr = gen->max_srq_wqes - 1; + props->local_ca_ack_delay = gen->local_ca_ack_delay; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq_sge = max_rq_sg - 1; + props->max_fast_reg_page_list_len = (unsigned int)-1; + props->local_ca_ack_delay = gen->local_ca_ack_delay; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = IB_ATOMIC_NONE; + props->max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); + props->max_mcast_grp = 1 << gen->log_max_mcg; + props->max_mcast_qp_attach = gen->max_qp_mcg; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG) + props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; + props->odp_caps = dev->odp_caps; +#endif + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + struct mlx5_general_caps *gen; + int ext_active_speed; + int err = -ENOMEM; + + gen = &dev->mdev->caps.gen; + if (port < 1 || port > gen->num_ports) { + mlx5_ib_warn(dev, "invalid port number %d\n", port); + return -EINVAL; + } + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof(*props)); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto out; + } + + + props->lid = be16_to_cpup((__be16 *)(out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *)(out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); + props->gid_tbl_len = out_mad->data[50]; + props->max_msg_sz = 1 << gen->log_max_msg; + props->pkey_tbl_len = gen->port[port - 1].pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = 16; /* FDR */ + break; + case 2: + props->active_speed = 32; /* EDR */ + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == 4) { + if (gen->ext_port_cap[port - 1] & + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = 8; + } + } + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +struct mlx5_reg_node_desc { + u8 desc[64]; +}; + +static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_reg_node_desc in; + struct mlx5_reg_node_desc out; + int err; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + memcpy(&in, props->node_desc, 64); + err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_NODE_DESC, 0, 1); + if (err) + return err; + + memcpy(ibdev->node_desc, props->node_desc, 64); + + return err; +} + +static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_port_attr attr; + u32 tmp; + int err; + + mutex_lock(&dev->cap_mask_mutex); + + err = mlx5_ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + tmp = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx5_set_port_caps(dev->mdev, port, tmp); + +out: + mutex_unlock(&dev->cap_mask_mutex); + return err; +} + +static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_ucontext_req_v2 req; + struct mlx5_ib_alloc_ucontext_resp resp; + struct mlx5_ib_ucontext *context; + struct mlx5_general_caps *gen; + struct mlx5_uuar_info *uuari; + struct mlx5_uar *uars; + int gross_uuars; + int num_uars; + int ver; + int uuarn; + int err; + int i; + size_t reqlen; + + gen = &dev->mdev->caps.gen; + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + memset(&req, 0, sizeof(req)); + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + ver = 2; + else + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&req, udata, reqlen); + if (err) + return ERR_PTR(err); + + if (req.flags || req.reserved) + return ERR_PTR(-EINVAL); + + if (req.total_num_uuars > MLX5_MAX_UUARS) + return ERR_PTR(-ENOMEM); + + if (req.total_num_uuars == 0) + return ERR_PTR(-EINVAL); + + req.total_num_uuars = ALIGN(req.total_num_uuars, + MLX5_NON_FP_BF_REGS_PER_PAGE); + if (req.num_low_latency_uuars > req.total_num_uuars - 1) + return ERR_PTR(-EINVAL); + + num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; + gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; + resp.qp_tab_size = 1 << gen->log_max_qp; + resp.bf_reg_size = gen->bf_reg_size; + resp.cache_line_size = L1_CACHE_BYTES; + resp.max_sq_desc_sz = gen->max_sq_desc_sz; + resp.max_rq_desc_sz = gen->max_rq_desc_sz; + resp.max_send_wqebb = gen->max_wqes; + resp.max_recv_wr = gen->max_wqes; + resp.max_srq_recv_wr = gen->max_srq_wqes; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + uuari = &context->uuari; + mutex_init(&uuari->lock); + uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); + if (!uars) { + err = -ENOMEM; + goto out_ctx; + } + + uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), + sizeof(*uuari->bitmap), + GFP_KERNEL); + if (!uuari->bitmap) { + err = -ENOMEM; + goto out_uar_ctx; + } + /* + * clear all fast path uuars + */ + for (i = 0; i < gross_uuars; i++) { + uuarn = i & 3; + if (uuarn == 2 || uuarn == 3) + set_bit(i, uuari->bitmap); + } + + uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); + if (!uuari->count) { + err = -ENOMEM; + goto out_bitmap; + } + + for (i = 0; i < num_uars; i++) { + err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index); + if (err) + goto out_count; + } + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; +#endif + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + resp.tot_uuars = req.total_num_uuars; + resp.num_ports = gen->num_ports; + err = ib_copy_to_udata(udata, &resp, + sizeof(resp) - sizeof(resp.reserved)); + if (err) + goto out_uars; + + uuari->ver = ver; + uuari->num_low_latency_uuars = req.num_low_latency_uuars; + uuari->uars = uars; + uuari->num_uars = num_uars; + return &context->ibucontext; + +out_uars: + for (i--; i >= 0; i--) + mlx5_cmd_free_uar(dev->mdev, uars[i].index); +out_count: + kfree(uuari->count); + +out_bitmap: + kfree(uuari->bitmap); + +out_uar_ctx: + kfree(uars); + +out_ctx: + kfree(context); + return ERR_PTR(err); +} + +static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + int i; + + for (i = 0; i < uuari->num_uars; i++) { + if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) + mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); + } + + kfree(uuari->count); + kfree(uuari->bitmap); + kfree(uuari->uars); + kfree(context); + + return 0; +} + +static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index) +{ + return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index; +} + +static int get_command(unsigned long offset) +{ + return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; +} + +static int get_arg(unsigned long offset) +{ + return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); +} + +static int get_index(unsigned long offset) +{ + return get_arg(offset); +} + +static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + unsigned long command; + unsigned long idx; + phys_addr_t pfn; + + command = get_command(vma->vm_pgoff); + switch (command) { + case MLX5_IB_MMAP_REGULAR_PAGE: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + idx = get_index(vma->vm_pgoff); + if (idx >= uuari->num_uars) + return -EINVAL; + + pfn = uar_index2pfn(dev, uuari->uars[idx].index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, + (unsigned long long)pfn); + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + break; + + case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: + return -ENOSYS; + + default: + return -EINVAL; + } + + return 0; +} + +static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) +{ + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_core_mr mr; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + seg = &in->seg; + seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); + goto err_in; + } + + kfree(in); + *key = mr.key; + + return 0; + +err_in: + kfree(in); + + return err; +} + +static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_core_mr mr; + int err; + + memset(&mr, 0, sizeof(mr)); + mr.key = key; + err = mlx5_core_destroy_mkey(dev->mdev, &mr); + if (err) + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key); +} + +static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_alloc_pd_resp resp; + struct mlx5_ib_pd *pd; + int err; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + resp.pdn = pd->pdn; + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } else { + err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn); + if (err) { + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(err); + } + } + + return &pd->ibpd; +} + +static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +{ + struct mlx5_ib_dev *mdev = to_mdev(pd->device); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + if (!pd->uobject) + free_pa_mkey(mdev, mpd->pa_lkey); + + mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); + kfree(mpd); + + return 0; +} + +static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int init_node_data(struct mlx5_ib_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, 64); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + dev->mdev->rev_id = be32_to_cpup((__be32 *)(out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); +} + +static ssize_t show_reg_pages(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(dev->mdev), + fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->mdev->rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, + dev->mdev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); +static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); + +static struct device_attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_fw_pages, + &dev_attr_reg_pages, +}; + +static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; + struct ib_event ibev; + + u8 port = 0; + + switch (event) { + case MLX5_DEV_EVENT_SYS_ERROR: + ibdev->ib_active = false; + ibev.event = IB_EVENT_DEVICE_FATAL; + break; + + case MLX5_DEV_EVENT_PORT_UP: + ibev.event = IB_EVENT_PORT_ACTIVE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PORT_DOWN: + ibev.event = IB_EVENT_PORT_ERR; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PORT_INITIALIZED: + /* not used by ULPs */ + return; + + case MLX5_DEV_EVENT_LID_CHANGE: + ibev.event = IB_EVENT_LID_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PKEY_CHANGE: + ibev.event = IB_EVENT_PKEY_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_GUID_CHANGE: + ibev.event = IB_EVENT_GID_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_CLIENT_REREG: + ibev.event = IB_EVENT_CLIENT_REREGISTER; + port = (u8)param; + break; + } + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = port; + + if (port < 1 || port > ibdev->num_ports) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); + return; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); +} + +static void get_ext_port_caps(struct mlx5_ib_dev *dev) +{ + struct mlx5_general_caps *gen; + int port; + + gen = &dev->mdev->caps.gen; + for (port = 1; port <= gen->num_ports; port++) + mlx5_query_ext_port_caps(dev, port); +} + +static int get_port_caps(struct mlx5_ib_dev *dev) +{ + struct ib_device_attr *dprops = NULL; + struct ib_port_attr *pprops = NULL; + struct mlx5_general_caps *gen; + int err = -ENOMEM; + int port; + + gen = &dev->mdev->caps.gen; + pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); + if (!pprops) + goto out; + + dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); + if (!dprops) + goto out; + + err = mlx5_ib_query_device(&dev->ib_dev, dprops); + if (err) { + mlx5_ib_warn(dev, "query_device failed %d\n", err); + goto out; + } + + for (port = 1; port <= gen->num_ports; port++) { + err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); + if (err) { + mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); + break; + } + gen->port[port - 1].pkey_table_len = dprops->max_pkeys; + gen->port[port - 1].gid_table_len = pprops->gid_tbl_len; + mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", + dprops->max_pkeys, pprops->gid_tbl_len); + } + +out: + kfree(pprops); + kfree(dprops); + + return err; +} + +static void destroy_umrc_res(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_mr_cache_cleanup(dev); + if (err) + mlx5_ib_warn(dev, "mr cache cleanup failed\n"); + + mlx5_ib_destroy_qp(dev->umrc.qp); + ib_destroy_cq(dev->umrc.cq); + ib_dereg_mr(dev->umrc.mr); + ib_dealloc_pd(dev->umrc.pd); +} + +enum { + MAX_UMR_WR = 128, +}; + +static int create_umr_res(struct mlx5_ib_dev *dev) +{ + struct ib_qp_init_attr *init_attr = NULL; + struct ib_qp_attr *attr = NULL; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_mr *mr; + int ret; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto error_0; + } + + pd = ib_alloc_pd(&dev->ib_dev); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + ret = PTR_ERR(pd); + goto error_0; + } + + mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(mr)) { + mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n"); + ret = PTR_ERR(mr); + goto error_1; + } + + cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, 128, + 0); + if (IS_ERR(cq)) { + mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); + ret = PTR_ERR(cq); + goto error_2; + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + + init_attr->send_cq = cq; + init_attr->recv_cq = cq; + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->cap.max_send_wr = MAX_UMR_WR; + init_attr->cap.max_send_sge = 1; + init_attr->qp_type = MLX5_IB_QPT_REG_UMR; + init_attr->port_num = 1; + qp = mlx5_ib_create_qp(pd, init_attr, NULL); + if (IS_ERR(qp)) { + mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); + ret = PTR_ERR(qp); + goto error_3; + } + qp->device = &dev->ib_dev; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = MLX5_IB_QPT_REG_UMR; + + attr->qp_state = IB_QPS_INIT; + attr->port_num = 1; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_PORT, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTR; + attr->path_mtu = IB_MTU_256; + + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTS; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); + goto error_4; + } + + dev->umrc.qp = qp; + dev->umrc.cq = cq; + dev->umrc.mr = mr; + dev->umrc.pd = pd; + + sema_init(&dev->umrc.sem, MAX_UMR_WR); + ret = mlx5_mr_cache_init(dev); + if (ret) { + mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + goto error_4; + } + + kfree(attr); + kfree(init_attr); + + return 0; + +error_4: + mlx5_ib_destroy_qp(qp); + +error_3: + ib_destroy_cq(cq); + +error_2: + ib_dereg_mr(mr); + +error_1: + ib_dealloc_pd(pd); + +error_0: + kfree(attr); + kfree(init_attr); + return ret; +} + +static int create_dev_resources(struct mlx5_ib_resources *devr) +{ + struct ib_srq_init_attr attr; + struct mlx5_ib_dev *dev; + int ret = 0; + + dev = container_of(devr, struct mlx5_ib_dev, devr); + + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->p0)) { + ret = PTR_ERR(devr->p0); + goto error0; + } + devr->p0->device = &dev->ib_dev; + devr->p0->uobject = NULL; + atomic_set(&devr->p0->usecnt, 0); + + devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, 1, 0, NULL, NULL); + if (IS_ERR(devr->c0)) { + ret = PTR_ERR(devr->c0); + goto error1; + } + devr->c0->device = &dev->ib_dev; + devr->c0->uobject = NULL; + devr->c0->comp_handler = NULL; + devr->c0->event_handler = NULL; + devr->c0->cq_context = NULL; + atomic_set(&devr->c0->usecnt, 0); + + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x0)) { + ret = PTR_ERR(devr->x0); + goto error2; + } + devr->x0->device = &dev->ib_dev; + devr->x0->inode = NULL; + atomic_set(&devr->x0->usecnt, 0); + mutex_init(&devr->x0->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x0->tgt_qp_list); + + devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x1)) { + ret = PTR_ERR(devr->x1); + goto error3; + } + devr->x1->device = &dev->ib_dev; + devr->x1->inode = NULL; + atomic_set(&devr->x1->usecnt, 0); + mutex_init(&devr->x1->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x1->tgt_qp_list); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_XRC; + attr.ext.xrc.cq = devr->c0; + attr.ext.xrc.xrcd = devr->x0; + + devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s0)) { + ret = PTR_ERR(devr->s0); + goto error4; + } + devr->s0->device = &dev->ib_dev; + devr->s0->pd = devr->p0; + devr->s0->uobject = NULL; + devr->s0->event_handler = NULL; + devr->s0->srq_context = NULL; + devr->s0->srq_type = IB_SRQT_XRC; + devr->s0->ext.xrc.xrcd = devr->x0; + devr->s0->ext.xrc.cq = devr->c0; + atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); + atomic_inc(&devr->s0->ext.xrc.cq->usecnt); + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s0->usecnt, 0); + + return 0; + +error4: + mlx5_ib_dealloc_xrcd(devr->x1); +error3: + mlx5_ib_dealloc_xrcd(devr->x0); +error2: + mlx5_ib_destroy_cq(devr->c0); +error1: + mlx5_ib_dealloc_pd(devr->p0); +error0: + return ret; +} + +static void destroy_dev_resources(struct mlx5_ib_resources *devr) +{ + mlx5_ib_destroy_srq(devr->s0); + mlx5_ib_dealloc_xrcd(devr->x0); + mlx5_ib_dealloc_xrcd(devr->x1); + mlx5_ib_destroy_cq(devr->c0); + mlx5_ib_dealloc_pd(devr->p0); +} + +static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +{ + struct mlx5_ib_dev *dev; + int err; + int i; + + printk_once(KERN_INFO "%s", mlx5_version); + + dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) + return NULL; + + dev->mdev = mdev; + + err = get_port_caps(dev); + if (err) + goto err_dealloc; + + get_ext_port_caps(dev); + + MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); + + strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = mdev->caps.gen.reserved_lkey; + dev->num_ports = mdev->caps.gen.num_ports; + dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.num_comp_vectors = + dev->mdev->priv.eq_table.num_comp_vectors; + dev->ib_dev.dma_device = &mdev->pdev->dev; + + dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + dev->ib_dev.uverbs_ex_cmd_mask = + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); + + dev->ib_dev.query_device = mlx5_ib_query_device; + dev->ib_dev.query_port = mlx5_ib_query_port; + dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.query_pkey = mlx5_ib_query_pkey; + dev->ib_dev.modify_device = mlx5_ib_modify_device; + dev->ib_dev.modify_port = mlx5_ib_modify_port; + dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; + dev->ib_dev.mmap = mlx5_ib_mmap; + dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; + dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; + dev->ib_dev.create_ah = mlx5_ib_create_ah; + dev->ib_dev.query_ah = mlx5_ib_query_ah; + dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; + dev->ib_dev.create_srq = mlx5_ib_create_srq; + dev->ib_dev.modify_srq = mlx5_ib_modify_srq; + dev->ib_dev.query_srq = mlx5_ib_query_srq; + dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; + dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; + dev->ib_dev.create_qp = mlx5_ib_create_qp; + dev->ib_dev.modify_qp = mlx5_ib_modify_qp; + dev->ib_dev.query_qp = mlx5_ib_query_qp; + dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; + dev->ib_dev.post_send = mlx5_ib_post_send; + dev->ib_dev.post_recv = mlx5_ib_post_recv; + dev->ib_dev.create_cq = mlx5_ib_create_cq; + dev->ib_dev.modify_cq = mlx5_ib_modify_cq; + dev->ib_dev.resize_cq = mlx5_ib_resize_cq; + dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; + dev->ib_dev.poll_cq = mlx5_ib_poll_cq; + dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; + dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; + dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; + dev->ib_dev.destroy_mr = mlx5_ib_destroy_mr; + dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; + dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; + dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.create_mr = mlx5_ib_create_mr; + dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr; + dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; + dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; + dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; + + mlx5_ib_internal_query_odp_caps(dev); + + if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { + dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; + dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + err = init_node_data(dev); + if (err) + goto err_dealloc; + + mutex_init(&dev->cap_mask_mutex); + + err = create_dev_resources(&dev->devr); + if (err) + goto err_dealloc; + + err = mlx5_ib_odp_init_one(dev); + if (err) + goto err_rsrc; + + err = ib_register_device(&dev->ib_dev, NULL); + if (err) + goto err_odp; + + err = create_umr_res(dev); + if (err) + goto err_dev; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + err = device_create_file(&dev->ib_dev.dev, + mlx5_class_attributes[i]); + if (err) + goto err_umrc; + } + + dev->ib_active = true; + + return dev; + +err_umrc: + destroy_umrc_res(dev); + +err_dev: + ib_unregister_device(&dev->ib_dev); + +err_odp: + mlx5_ib_odp_remove_one(dev); + +err_rsrc: + destroy_dev_resources(&dev->devr); + +err_dealloc: + ib_dealloc_device((struct ib_device *)dev); + + return NULL; +} + +static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) +{ + struct mlx5_ib_dev *dev = context; + + ib_unregister_device(&dev->ib_dev); + destroy_umrc_res(dev); + mlx5_ib_odp_remove_one(dev); + destroy_dev_resources(&dev->devr); + ib_dealloc_device(&dev->ib_dev); +} + +static struct mlx5_interface mlx5_ib_interface = { + .add = mlx5_ib_add, + .remove = mlx5_ib_remove, + .event = mlx5_ib_event, + .protocol = MLX5_INTERFACE_PROTOCOL_IB, +}; + +static int __init mlx5_ib_init(void) +{ + int err; + + if (deprecated_prof_sel != 2) + pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); + + err = mlx5_ib_odp_init(); + if (err) + return err; + + err = mlx5_register_interface(&mlx5_ib_interface); + if (err) + goto clean_odp; + + return err; + +clean_odp: + mlx5_ib_odp_cleanup(); + return err; +} + +static void __exit mlx5_ib_cleanup(void) +{ + mlx5_unregister_interface(&mlx5_ib_interface); + mlx5_ib_odp_cleanup(); +} + +module_init(mlx5_ib_init); +module_exit(mlx5_ib_cleanup); diff --git a/kernel/drivers/infiniband/hw/mlx5/mem.c b/kernel/drivers/infiniband/hw/mlx5/mem.c new file mode 100644 index 000000000..40df2cca0 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/mem.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> +#include "mlx5_ib.h" + +/* @umem: umem object to scan + * @addr: ib virtual address requested by the user + * @count: number of PAGE_SIZE pages covered by umem + * @shift: page shift for the compound pages found in the region + * @ncont: number of compund pages + * @order: log2 of the number of compound pages + */ +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order) +{ + unsigned long tmp; + unsigned long m; + int i, k; + u64 base = 0; + int p = 0; + int skip; + int mask; + u64 len; + u64 pfn; + struct scatterlist *sg; + int entry; + unsigned long page_shift = ilog2(umem->page_size); + + /* With ODP we must always match OS page size. */ + if (umem->odp_data) { + *count = ib_umem_page_count(umem); + *shift = PAGE_SHIFT; + *ncont = *count; + if (order) + *order = ilog2(roundup_pow_of_two(*count)); + + return; + } + + addr = addr >> page_shift; + tmp = (unsigned long)addr; + m = find_first_bit(&tmp, sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> page_shift; + pfn = sg_dma_address(sg) >> page_shift; + for (k = 0; k < len; k++) { + if (!(i & mask)) { + tmp = (unsigned long)pfn; + m = min_t(unsigned long, m, find_first_bit(&tmp, sizeof(tmp))); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } else { + if (base + p != pfn) { + tmp = (unsigned long)p; + m = find_first_bit(&tmp, sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } + } + p++; + i++; + } + } + + if (i) { + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + + if (order) + *order = ilog2(roundup_pow_of_two(i) >> m); + + *ncont = DIV_ROUND_UP(i, (1 << m)); + } else { + m = 0; + + if (order) + *order = 0; + + *ncont = 0; + } + *shift = page_shift + m; + *count = i; +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static u64 umem_dma_to_mtt(dma_addr_t umem_dma) +{ + u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; + + if (umem_dma & ODP_READ_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_READ; + if (umem_dma & ODP_WRITE_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_WRITE; + + return mtt_entry; +} +#endif + +/* + * Populate the given array with bus addresses from the umem. + * + * dev - mlx5_ib device + * umem - umem to use to fill the pages + * page_shift - determines the page size used in the resulting array + * offset - offset into the umem to start from, + * only implemented for ODP umems + * num_pages - total number of pages to fill + * pas - bus addresses array to fill + * access_flags - access flags to set on all present pages. + use enum mlx5_ib_mtt_access_flags for this. + */ +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags) +{ + unsigned long umem_page_shift = ilog2(umem->page_size); + int shift = page_shift - umem_page_shift; + int mask = (1 << shift) - 1; + int i, k; + u64 cur = 0; + u64 base; + int len; + struct scatterlist *sg; + int entry; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + const bool odp = umem->odp_data != NULL; + + if (odp) { + WARN_ON(shift != 0); + WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); + + for (i = 0; i < num_pages; ++i) { + dma_addr_t pa = umem->odp_data->dma_list[offset + i]; + + pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + } + return; + } +#endif + + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> umem_page_shift; + base = sg_dma_address(sg); + for (k = 0; k < len; k++) { + if (!(i & mask)) { + cur = base + (k << umem_page_shift); + cur |= access_flags; + + pas[i >> shift] = cpu_to_be64(cur); + mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", + i >> shift, be64_to_cpu(pas[i >> shift])); + } else + mlx5_ib_dbg(dev, "=====> 0x%llx\n", + base + (k << umem_page_shift)); + i++; + } + } +} + +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags) +{ + return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, + ib_umem_num_pages(umem), pas, + access_flags); +} +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) +{ + u64 page_size; + u64 page_mask; + u64 off_size; + u64 off_mask; + u64 buf_off; + + page_size = (u64)1 << page_shift; + page_mask = page_size - 1; + buf_off = addr & page_mask; + off_size = page_size >> 6; + off_mask = off_size - 1; + + if (buf_off & off_mask) + return -EINVAL; + + *offset = buf_off >> ilog2(off_size); + return 0; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/mlx5_ib.h b/kernel/drivers/infiniband/hw/mlx5/mlx5_ib.h new file mode 100644 index 000000000..dff1cfcdf --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -0,0 +1,669 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_H +#define MLX5_IB_H + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_smi.h> +#include <linux/mlx5/driver.h> +#include <linux/mlx5/cq.h> +#include <linux/mlx5/qp.h> +#include <linux/mlx5/srq.h> +#include <linux/types.h> + +#define mlx5_ib_dbg(dev, format, arg...) \ +pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_err(dev, format, arg...) \ +pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_warn(dev, format, arg...) \ +pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +enum { + MLX5_IB_MMAP_CMD_SHIFT = 8, + MLX5_IB_MMAP_CMD_MASK = 0xff, +}; + +enum mlx5_ib_mmap_cmd { + MLX5_IB_MMAP_REGULAR_PAGE = 0, + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, /* always last */ +}; + +enum { + MLX5_RES_SCAT_DATA32_CQE = 0x1, + MLX5_RES_SCAT_DATA64_CQE = 0x2, + MLX5_REQ_SCAT_DATA32_CQE = 0x11, + MLX5_REQ_SCAT_DATA64_CQE = 0x22, +}; + +enum mlx5_ib_latency_class { + MLX5_IB_LATENCY_CLASS_LOW, + MLX5_IB_LATENCY_CLASS_MEDIUM, + MLX5_IB_LATENCY_CLASS_HIGH, + MLX5_IB_LATENCY_CLASS_FAST_PATH +}; + +enum mlx5_ib_mad_ifc_flags { + MLX5_MAD_IFC_IGNORE_MKEY = 1, + MLX5_MAD_IFC_IGNORE_BKEY = 2, + MLX5_MAD_IFC_NET_VIEW = 4, +}; + +struct mlx5_ib_ucontext { + struct ib_ucontext ibucontext; + struct list_head db_page_list; + + /* protect doorbell record alloc/free + */ + struct mutex db_page_mutex; + struct mlx5_uuar_info uuari; +}; + +static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); +} + +struct mlx5_ib_pd { + struct ib_pd ibpd; + u32 pdn; + u32 pa_lkey; +}; + +/* Use macros here so that don't have to duplicate + * enum ib_send_flags and enum ib_qp_type for low-level driver + */ + +#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START +#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) +#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) +#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +#define MLX5_IB_WR_UMR IB_WR_RESERVED1 + +struct wr_list { + u16 opcode; + u16 next; +}; + +struct mlx5_ib_wq { + u64 *wrid; + u32 *wr_data; + struct wr_list *w_list; + unsigned *wqe_head; + u16 unsig_count; + + /* serialize post to the work queue + */ + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; + u16 cur_post; + u16 last_poll; + void *qend; +}; + +enum { + MLX5_QP_USER, + MLX5_QP_KERNEL, + MLX5_QP_EMPTY +}; + +/* + * Connect-IB can trigger up to four concurrent pagefaults + * per-QP. + */ +enum mlx5_ib_pagefault_context { + MLX5_IB_PAGEFAULT_RESPONDER_READ, + MLX5_IB_PAGEFAULT_REQUESTOR_READ, + MLX5_IB_PAGEFAULT_RESPONDER_WRITE, + MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, + MLX5_IB_PAGEFAULT_CONTEXTS +}; + +static inline enum mlx5_ib_pagefault_context + mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) +{ + return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); +} + +struct mlx5_ib_pfault { + struct work_struct work; + struct mlx5_pagefault mpfault; +}; + +struct mlx5_ib_qp { + struct ib_qp ibqp; + struct mlx5_core_qp mqp; + struct mlx5_buf buf; + + struct mlx5_db db; + struct mlx5_ib_wq rq; + + u32 doorbell_qpn; + u8 sq_signal_bits; + u8 fm_cache; + int sq_max_wqes_per_wr; + int sq_spare_wqes; + struct mlx5_ib_wq sq; + + struct ib_umem *umem; + int buf_size; + + /* serialize qp state modifications + */ + struct mutex mutex; + u16 xrcdn; + u32 flags; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 state; + int mlx_type; + int wq_sig; + int scat_cqe; + int max_inline_data; + struct mlx5_bf *bf; + int has_rq; + + /* only for user space QPs. For kernel + * we have it from the bf object + */ + int uuarn; + + int create_type; + u32 pa_lkey; + + /* Store signature errors */ + bool signature_en; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * A flag that is true for QP's that are in a state that doesn't + * allow page faults, and shouldn't schedule any more faults. + */ + int disable_page_faults; + /* + * The disable_page_faults_lock protects a QP's disable_page_faults + * field, allowing for a thread to atomically check whether the QP + * allows page faults, and if so schedule a page fault. + */ + spinlock_t disable_page_faults_lock; + struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; +#endif +}; + +struct mlx5_ib_cq_buf { + struct mlx5_buf buf; + struct ib_umem *umem; + int cqe_size; + int nent; +}; + +enum mlx5_ib_qp_flags { + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, +}; + +struct mlx5_umr_wr { + union { + u64 virt_addr; + u64 offset; + } target; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int npages; + u32 length; + int access_flags; + u32 mkey; +}; + +struct mlx5_shared_mr_info { + int mr_id; + struct ib_umem *umem; +}; + +struct mlx5_ib_cq { + struct ib_cq ibcq; + struct mlx5_core_cq mcq; + struct mlx5_ib_cq_buf buf; + struct mlx5_db db; + + /* serialize access to the CQ + */ + spinlock_t lock; + + /* protect resize cq + */ + struct mutex resize_mutex; + struct mlx5_ib_cq_buf *resize_buf; + struct ib_umem *resize_umem; + int cqe_size; +}; + +struct mlx5_ib_srq { + struct ib_srq ibsrq; + struct mlx5_core_srq msrq; + struct mlx5_buf buf; + struct mlx5_db db; + u64 *wrid; + /* protect SRQ hanlding + */ + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + /* serialize arming a SRQ + */ + struct mutex mutex; + int wq_sig; +}; + +struct mlx5_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; +}; + +enum mlx5_ib_mtt_access_flags { + MLX5_IB_MTT_READ = (1 << 0), + MLX5_IB_MTT_WRITE = (1 << 1), +}; + +#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) + +struct mlx5_ib_mr { + struct ib_mr ibmr; + struct mlx5_core_mr mmr; + struct ib_umem *umem; + struct mlx5_shared_mr_info *smr_info; + struct list_head list; + int order; + int umred; + int npages; + struct mlx5_ib_dev *dev; + struct mlx5_create_mkey_mbox_out out; + struct mlx5_core_sig_ctx *sig; + int live; +}; + +struct mlx5_ib_fast_reg_page_list { + struct ib_fast_reg_page_list ibfrpl; + __be64 *mapped_page_list; + dma_addr_t map; +}; + +struct mlx5_ib_umr_context { + enum ib_wc_status status; + struct completion done; +}; + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->status = -1; + init_completion(&context->done); +} + +struct umr_common { + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_mr *mr; + /* control access to UMR QP + */ + struct semaphore sem; +}; + +enum { + MLX5_FMR_INVALID, + MLX5_FMR_VALID, + MLX5_FMR_BUSY, +}; + +struct mlx5_ib_fmr { + struct ib_fmr ibfmr; + struct mlx5_core_mr mr; + int access_flags; + int state; + /* protect fmr state + */ + spinlock_t lock; + u64 wrid; + struct ib_send_wr wr[2]; + u8 page_shift; + struct ib_fast_reg_page_list page_list; +}; + +struct mlx5_cache_ent { + struct list_head head; + /* sync access to the cahce entry + */ + spinlock_t lock; + + + struct dentry *dir; + char name[4]; + u32 order; + u32 size; + u32 cur; + u32 miss; + u32 limit; + + struct dentry *fsize; + struct dentry *fcur; + struct dentry *fmiss; + struct dentry *flimit; + + struct mlx5_ib_dev *dev; + struct work_struct work; + struct delayed_work dwork; + int pending; +}; + +struct mlx5_mr_cache { + struct workqueue_struct *wq; + struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + int stopped; + struct dentry *root; + unsigned long last_add; +}; + +struct mlx5_ib_resources { + struct ib_cq *c0; + struct ib_xrcd *x0; + struct ib_xrcd *x1; + struct ib_pd *p0; + struct ib_srq *s0; +}; + +struct mlx5_ib_dev { + struct ib_device ib_dev; + struct mlx5_core_dev *mdev; + MLX5_DECLARE_DOORBELL_LOCK(uar_lock); + int num_ports; + /* serialize update of capability mask + */ + struct mutex cap_mask_mutex; + bool ib_active; + struct umr_common umrc; + /* sync used page count stats + */ + struct mlx5_ib_resources devr; + struct mlx5_mr_cache cache; + struct timer_list delay_timer; + int fill_delay; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct ib_odp_caps odp_caps; + /* + * Sleepable RCU that prevents destruction of MRs while they are still + * being used by a page fault handler. + */ + struct srcu_struct mr_srcu; +#endif +}; + +static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) +{ + return container_of(mcq, struct mlx5_ib_cq, mcq); +} + +static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); +} + +static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx5_ib_dev, ib_dev); +} + +static inline struct mlx5_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx5_ib_fmr, ibfmr); +} + +static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx5_ib_cq, ibcq); +} + +static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) +{ + return container_of(mqp, struct mlx5_ib_qp, mqp); +} + +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +{ + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + +static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx5_ib_pd, ibpd); +} + +static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx5_ib_srq, ibsrq); +} + +static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx5_ib_qp, ibqp); +} + +static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) +{ + return container_of(msrq, struct mlx5_ib_srq, msrq); +} + +static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx5_ib_mr, ibmr); +} + +static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) +{ + return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl); +} + +struct mlx5_ib_ah { + struct ib_ah ibah; + struct mlx5_av av; +}; + +static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx5_ib_ah, ibah); +} + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db); +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah); +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int mlx5_ib_destroy_ah(struct ib_ah *ah); +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int mlx5_ib_destroy_srq(struct ib_srq *srq); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_destroy_qp(struct ib_qp *qp); +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length); +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_destroy_cq(struct ib_cq *cq); +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, + int npages, int zap); +int mlx5_ib_dereg_mr(struct ib_mr *ibmr); +int mlx5_ib_destroy_mr(struct ib_mr *ibmr); +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr); +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len); +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len); +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); +struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr); +int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova); +int mlx5_ib_unmap_fmr(struct list_head *fmr_list); +int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr); +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd); +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order); +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags); +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags); +void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); +int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +extern struct workqueue_struct *mlx5_ib_page_fault_wq; + +int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev); +void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault); +void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); +int __init mlx5_ib_odp_init(void); +void mlx5_ib_odp_cleanup(void); +void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); +void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end); + +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) +{ + return 0; +} + +static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} +static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } +static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} +static inline int mlx5_ib_odp_init(void) { return 0; } +static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} +static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +static inline void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static inline u8 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ; +} + +#define MLX5_MAX_UMR_SHIFT 16 +#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) + +#endif /* MLX5_IB_H */ diff --git a/kernel/drivers/infiniband/hw/mlx5/mr.c b/kernel/drivers/infiniband/hw/mlx5/mr.c new file mode 100644 index 000000000..71c593583 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/mr.c @@ -0,0 +1,1479 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include <linux/kref.h> +#include <linux/random.h> +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> +#include <rdma/ib_verbs.h> +#include "mlx5_ib.h" + +enum { + MAX_PENDING_REG_MR = 8, +}; + +#define MLX5_UMR_ALIGN 2048 +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static __be64 mlx5_ib_update_mtt_emergency_buffer[ + MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)] + __aligned(MLX5_UMR_ALIGN); +static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); +#endif + +static int clean_mr(struct mlx5_ib_mr *mr); + +static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* Wait until all page fault handlers using the mr complete. */ + synchronize_srcu(&dev->mr_srcu); +#endif + + return err; +} + +static int order2idx(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + + if (order < cache->ent[0].order) + return 0; + else + return order - cache->ent[0].order; +} + +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + u8 key; + unsigned long flags; + struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; + int err; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + if (mr->out.hdr.status) { + mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", + mr->out.hdr.status, + be32_to_cpu(mr->out.hdr.syndrome)); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); + key = dev->mdev->priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); + mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); + + write_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key), + &mr->mmr); + if (err) + pr_err("Error inserting to mr tree. 0x%x\n", -err); + write_unlock_irqrestore(&table->lock, flags); +} + +static int add_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int npages = 1 << ent->order; + int err = 0; + int i; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + break; + } + mr->order = ent->order; + mr->umred = 1; + mr->dev = dev; + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; + in->seg.log2_page_size = 12; + + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, + sizeof(*in), reg_mr_callback, + mr, &mr->out); + if (err) { + spin_lock_irq(&ent->lock); + ent->pending--; + spin_unlock_irq(&ent->lock); + mlx5_ib_warn(dev, "create mkey failed %d\n", err); + kfree(mr); + break; + } + } + + kfree(in); + return err; +} + +static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + int i; + + for (i = 0; i < num; i++) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey\n"); + else + kfree(mr); + } +} + +static ssize_t size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20]; + u32 var; + int err; + int c; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + c = order2idx(dev, ent->order); + lbuf[sizeof(lbuf) - 1] = 0; + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var < ent->limit) + return -EINVAL; + + if (var > ent->size) { + do { + err = add_keys(dev, c, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); + } else if (var < ent->size) { + remove_keys(dev, c, ent->size - var); + } + + return count; +} + +static ssize_t size_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + if (*pos) + return 0; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size); + if (err < 0) + return err; + + if (copy_to_user(buf, lbuf, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static const struct file_operations size_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = size_write, + .read = size_read, +}; + +static ssize_t limit_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20]; + u32 var; + int err; + int c; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + c = order2idx(dev, ent->order); + lbuf[sizeof(lbuf) - 1] = 0; + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var > ent->size) + return -EINVAL; + + ent->limit = var; + + if (ent->cur < ent->limit) { + err = add_keys(dev, c, 2 * ent->limit - ent->cur); + if (err) + return err; + } + + return count; +} + +static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + if (*pos) + return 0; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); + if (err < 0) + return err; + + if (copy_to_user(buf, lbuf, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static const struct file_operations limit_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = limit_write, + .read = limit_read, +}; + +static int someone_adding(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur < cache->ent[i].limit) + return 1; + } + + return 0; +} + +static void __cache_work_func(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_dev *dev = ent->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int i = order2idx(dev, ent->order); + int err; + + if (cache->stopped) + return; + + ent = &dev->cache.ent[i]; + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3)); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } else { + queue_work(cache->wq, &ent->work); + } + } + } else if (ent->cur > 2 * ent->limit) { + if (!someone_adding(cache) && + time_after(jiffies, cache->last_add + 300 * HZ)) { + remove_keys(dev, i, 1); + if (ent->cur > ent->limit) + queue_work(cache->wq, &ent->work); + } else { + queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); + } + } +} + +static void delayed_cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, dwork.work); + __cache_work_func(ent); +} + +static void cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, work); + __cache_work_func(ent); +} + +static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_ib_mr *mr = NULL; + struct mlx5_cache_ent *ent; + int c; + int i; + + c = order2idx(dev, order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); + return NULL; + } + + for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + + mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); + + spin_lock_irq(&ent->lock); + if (!list_empty(&ent->head)) { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + break; + } + spin_unlock_irq(&ent->lock); + + queue_work(cache->wq, &ent->work); + + if (mr) + break; + } + + if (!mr) + cache->ent[c].miss++; + + return mr; +} + +static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int shrink = 0; + int c; + + c = order2idx(dev, mr->order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); + return; + } + ent = &cache->ent[c]; + spin_lock_irq(&ent->lock); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + if (ent->cur > 2 * ent->limit) + shrink = 1; + spin_unlock_irq(&ent->lock); + + if (shrink) + queue_work(cache->wq, &ent->work); +} + +static void clean_keys(struct mlx5_ib_dev *dev, int c) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + + cancel_delayed_work(&ent->dwork); + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey\n"); + else + kfree(mr); + } +} + +static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int i; + + if (!mlx5_debugfs_root) + return 0; + + cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root); + if (!cache->root) + return -ENOMEM; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + sprintf(ent->name, "%d", ent->order); + ent->dir = debugfs_create_dir(ent->name, cache->root); + if (!ent->dir) + return -ENOMEM; + + ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent, + &size_fops); + if (!ent->fsize) + return -ENOMEM; + + ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent, + &limit_fops); + if (!ent->flimit) + return -ENOMEM; + + ent->fcur = debugfs_create_u32("cur", 0400, ent->dir, + &ent->cur); + if (!ent->fcur) + return -ENOMEM; + + ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir, + &ent->miss); + if (!ent->fmiss) + return -ENOMEM; + } + + return 0; +} + +static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->cache.root); +} + +static void delay_time_func(unsigned long ctx) +{ + struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; + + dev->fill_delay = 0; +} + +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int limit; + int err; + int i; + + cache->wq = create_singlethread_workqueue("mkey_cache"); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); + return -ENOMEM; + } + + setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + INIT_LIST_HEAD(&cache->ent[i].head); + spin_lock_init(&cache->ent[i].lock); + + ent = &cache->ent[i]; + INIT_LIST_HEAD(&ent->head); + spin_lock_init(&ent->lock); + ent->order = i + 2; + ent->dev = dev; + + if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) + limit = dev->mdev->profile->mr_cache[i].limit; + else + limit = 0; + + INIT_WORK(&ent->work, cache_work_func); + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + ent->limit = limit; + queue_work(cache->wq, &ent->work); + } + + err = mlx5_mr_cache_debugfs_init(dev); + if (err) + mlx5_ib_warn(dev, "cache debugfs failure\n"); + + return 0; +} + +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +{ + int i; + + dev->cache.stopped = 1; + flush_workqueue(dev->cache.wq); + + mlx5_mr_cache_debugfs_cleanup(dev); + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + clean_keys(dev, i); + + destroy_workqueue(dev->cache.wq); + del_timer_sync(&dev->delay_timer); + + return 0; +} + +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + seg = &in->seg; + seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + NULL); + if (err) + goto err_in; + + kfree(in); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +static int get_octo_len(u64 addr, u64 len, int page_size) +{ + u64 offset; + int npages; + + offset = addr & (page_size - 1); + npages = ALIGN(len + offset, page_size) >> ilog2(page_size); + return (npages + 1) / 2; +} + +static int use_umr(int order) +{ + return order <= MLX5_MAX_UMR_SHIFT; +} + +static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift, u64 virt_addr, u64 len, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_mr *mr = dev->umrc.mr; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + sg->addr = dma; + sg->length = ALIGN(sizeof(u64) * n, 64); + sg->lkey = mr->lkey; + + wr->next = NULL; + wr->send_flags = 0; + wr->sg_list = sg; + if (n) + wr->num_sge = 1; + else + wr->num_sge = 0; + + wr->opcode = MLX5_IB_WR_UMR; + + umrwr->npages = n; + umrwr->page_shift = page_shift; + umrwr->mkey = key; + umrwr->target.virt_addr = virt_addr; + umrwr->length = len; + umrwr->access_flags = access_flags; + umrwr->pd = pd; +} + +static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, + struct ib_send_wr *wr, u32 key) +{ + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; + wr->opcode = MLX5_IB_WR_UMR; + umrwr->mkey = key; +} + +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) +{ + struct mlx5_ib_umr_context *context; + struct ib_wc wc; + int err; + + while (1) { + err = ib_poll_cq(cq, 1, &wc); + if (err < 0) { + pr_warn("poll cq error %d\n", err); + return; + } + if (err == 0) + break; + + context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id; + context->status = wc.status; + complete(&context->done); + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +} + +static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, + u64 virt_addr, u64 len, int npages, + int page_shift, int order, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr wr, *bad; + struct mlx5_ib_mr *mr; + struct ib_sge sg; + int size; + __be64 *mr_pas; + __be64 *pas; + dma_addr_t dma; + int err = 0; + int i; + + for (i = 0; i < 1; i++) { + mr = alloc_cached_mr(dev, order); + if (mr) + break; + + err = add_keys(dev, order2idx(dev, order), 1); + if (err && err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); + break; + } + } + + if (!mr) + return ERR_PTR(-EAGAIN); + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. + * To avoid copying garbage after the pas array, we allocate + * a little more. */ + size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); + mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!mr_pas) { + err = -ENOMEM; + goto free_mr; + } + + pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); + /* Clear padding after the actual pages. */ + memset(pas + npages, 0, size - npages * sizeof(u64)); + + dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + err = -ENOMEM; + goto free_pas; + } + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + prep_umr_reg_wqe(pd, &wr, &sg, dma, npages, mr->mmr.key, page_shift, + virt_addr, len, access_flags); + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + goto unmap_dma; + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + } + } + + mr->mmr.iova = virt_addr; + mr->mmr.size = len; + mr->mmr.pd = to_mpd(pd)->pdn; + + mr->live = 1; + +unmap_dma: + up(&umrc->sem); + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + +free_pas: + kfree(mr_pas); + +free_mr: + if (err) { + free_cached_mr(dev, mr); + return ERR_PTR(err); + } + + return mr; +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, + int zap) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_umem *umem = mr->umem; + int size; + __be64 *pas; + dma_addr_t dma; + struct ib_send_wr wr, *bad; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg; + struct ib_sge sg; + int err = 0; + const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64); + const int page_index_mask = page_index_alignment - 1; + size_t pages_mapped = 0; + size_t pages_to_map = 0; + size_t pages_iter = 0; + int use_emergency_buf = 0; + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, + * so we need to align the offset and length accordingly */ + if (start_page_index & page_index_mask) { + npages += start_page_index & page_index_mask; + start_page_index &= ~page_index_mask; + } + + pages_to_map = ALIGN(npages, page_index_alignment); + + if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES) + return -EINVAL; + + size = sizeof(u64) * pages_to_map; + size = min_t(int, PAGE_SIZE, size); + /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim + * code, when we are called from an invalidation. The pas buffer must + * be 2k-aligned for Connect-IB. */ + pas = (__be64 *)get_zeroed_page(GFP_ATOMIC); + if (!pas) { + mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n"); + pas = mlx5_ib_update_mtt_emergency_buffer; + size = MLX5_UMR_MTT_MIN_CHUNK_SIZE; + use_emergency_buf = 1; + mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + memset(pas, 0, size); + } + pages_iter = size / sizeof(u64); + dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + mlx5_ib_err(dev, "unable to map DMA during MTT update.\n"); + err = -ENOMEM; + goto free_pas; + } + + for (pages_mapped = 0; + pages_mapped < pages_to_map && !err; + pages_mapped += pages_iter, start_page_index += pages_iter) { + dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); + + npages = min_t(size_t, + pages_iter, + ib_umem_num_pages(umem) - start_page_index); + + if (!zap) { + __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT, + start_page_index, npages, pas, + MLX5_IB_MTT_PRESENT); + /* Clear padding after the pages brought from the + * umem. */ + memset(pas + npages, 0, size - npages * sizeof(u64)); + } + + dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + + sg.addr = dma; + sg.length = ALIGN(npages * sizeof(u64), + MLX5_UMR_MTT_ALIGNMENT); + sg.lkey = dev->umrc.mr->lkey; + + wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | + MLX5_IB_SEND_UMR_UPDATE_MTT; + wr.sg_list = &sg; + wr.num_sge = 1; + wr.opcode = MLX5_IB_WR_UMR; + umrwr->npages = sg.length / sizeof(u64); + umrwr->page_shift = PAGE_SHIFT; + umrwr->mkey = mr->mmr.key; + umrwr->target.offset = start_page_index; + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_err(dev, "UMR post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_err(dev, "UMR completion failed, code %d\n", + umr_context.status); + err = -EFAULT; + } + } + up(&umrc->sem); + } + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + +free_pas: + if (!use_emergency_buf) + free_page((unsigned long)pas); + else + mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + + return err; +} +#endif + +static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, + u64 length, struct ib_umem *umem, + int npages, int page_shift, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int inlen; + int err; + bool pg_cap = !!(dev->mdev->caps.gen.flags & + MLX5_DEV_CAP_FLAG_ON_DMND_PG); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_1; + } + mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); + + /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags + * in the page list submitted with the command. */ + in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; + in->seg.flags = convert_access(access_flags) | + MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.start_addr = cpu_to_be64(virt_addr); + in->seg.len = cpu_to_be64(length); + in->seg.bsfs_octo_size = 0; + in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); + in->seg.log2_page_size = page_shift; + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, + 1 << page_shift)); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, + NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "create mkey failed\n"); + goto err_2; + } + mr->umem = umem; + mr->dev = dev; + mr->live = 1; + kvfree(in); + + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); + + return mr; + +err_2: + kvfree(in); + +err_1: + kfree(mr); + + return ERR_PTR(err); +} + +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + struct ib_umem *umem; + int page_shift; + int npages; + int ncont; + int order; + int err; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); + umem = ib_umem_get(pd->uobject->context, start, length, access_flags, + 0); + if (IS_ERR(umem)) { + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); + return (void *)umem; + } + + mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order); + if (!npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + err = -EINVAL; + goto error; + } + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + npages, ncont, order, page_shift); + + if (use_umr(order)) { + mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, + order, access_flags); + if (PTR_ERR(mr) == -EAGAIN) { + mlx5_ib_dbg(dev, "cache empty for order %d", order); + mr = NULL; + } + } else if (access_flags & IB_ACCESS_ON_DEMAND) { + err = -EINVAL; + pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); + goto error; + } + + if (!mr) + mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, + access_flags); + + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto error; + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); + + mr->umem = umem; + mr->npages = npages; + atomic_add(npages, &dev->mdev->priv.reg_pages); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem->odp_data) { + /* + * This barrier prevents the compiler from moving the + * setting of umem->odp_data->private to point to our + * MR, before reg_umr finished, to ensure that the MR + * initialization have finished before starting to + * handle invalidations. + */ + smp_wmb(); + mr->umem->odp_data->private = mr; + /* + * Make sure we will see the new + * umem->odp_data->private value in the invalidation + * routines, before we can get page faults on the + * MR. Page faults can happen once we put the MR in + * the tree, below this line. Without the barrier, + * there can be a fault handling and an invalidation + * before umem->odp_data->private == mr is visible to + * the invalidation handler. + */ + smp_wmb(); + } +#endif + + return &mr->ibmr; + +error: + /* + * Destroy the umem *before* destroying the MR, to ensure we + * will not have any in-flight notifiers when destroying the + * MR. + * + * As the MR is completely invalid to begin with, and this + * error path is only taken if we can't push the mr entry into + * the pagefault tree, this is safe. + */ + + ib_umem_release(umem); + /* Kill the MR, and return an error code. */ + clean_mr(mr); + return ERR_PTR(err); +} + +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr wr, *bad; + int err; + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + prep_umr_unreg_wqe(dev, &wr, mr->mmr.key); + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + up(&umrc->sem); + mlx5_ib_dbg(dev, "err %d\n", err); + goto error; + } else { + wait_for_completion(&umr_context.done); + up(&umrc->sem); + } + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "unreg umr failed\n"); + err = -EFAULT; + goto error; + } + return 0; + +error: + return err; +} + +static int clean_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + int umred = mr->umred; + int err; + + if (!umred) { + err = destroy_mkey(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + } else { + err = unreg_umr(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed unregister\n"); + return err; + } + free_cached_mr(dev, mr); + } + + if (!umred) + kfree(mr); + + return 0; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int npages = mr->npages; + struct ib_umem *umem = mr->umem; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem && umem->odp_data) { + /* Prevent new page faults from succeeding */ + mr->live = 0; + /* Wait for all running page-fault handlers to finish. */ + synchronize_srcu(&dev->mr_srcu); + /* Destroy all page mappings */ + mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + ib_umem_end(umem)); + /* + * We kill the umem before the MR for ODP, + * so that there will not be any invalidations in + * flight, looking at the *mr struct. + */ + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + + /* Avoid double-freeing the umem. */ + umem = NULL; + } +#endif + + clean_mr(mr); + + if (umem) { + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + } + + return 0; +} + +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int access_mode, err; + int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32(ndescs); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + access_mode = MLX5_ACCESS_MODE_MTT; + + if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) { + u32 psv_index[2]; + + in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) | + MLX5_MKEY_BSF_EN); + in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) { + err = -ENOMEM; + goto err_free_in; + } + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, + 2, psv_index); + if (err) + goto err_free_sig; + + access_mode = MLX5_ACCESS_MODE_KLM; + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + } + + in->seg.flags = MLX5_PERM_UMR_EN | access_mode; + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) + goto err_destroy_psv; + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + kfree(in); + + return &mr->ibmr; + +err_destroy_psv: + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + } +err_free_sig: + kfree(mr->sig); +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int err; + + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + kfree(mr->sig); + } + + err = destroy_mkey(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + + kfree(mr); + + return err; +} + +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + /* + * TBD not needed - issue 197292 */ + in->seg.log2_page_size = PAGE_SHIFT; + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL, + NULL, NULL); + kfree(in); + if (err) + goto err_free; + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl; + int size = page_list_len * sizeof(u64); + + mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL); + if (!mfrpl) + return ERR_PTR(-ENOMEM); + + mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL); + if (!mfrpl->ibfrpl.page_list) + goto err_free; + + mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device, + size, &mfrpl->map, + GFP_KERNEL); + if (!mfrpl->mapped_page_list) + goto err_free; + + WARN_ON(mfrpl->map & 0x3f); + + return &mfrpl->ibfrpl; + +err_free: + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); + return ERR_PTR(-ENOMEM); +} + +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); + struct mlx5_ib_dev *dev = to_mdev(page_list->device); + int size = page_list->max_page_list_len * sizeof(u64); + + dma_free_coherent(&dev->mdev->pdev->dev, size, mfrpl->mapped_page_list, + mfrpl->map); + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); +} + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + int ret = 0; + + if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { + pr_err("Invalid status check mask\n"); + ret = -EINVAL; + goto done; + } + + mr_status->fail_status = 0; + if (check_mask & IB_MR_CHECK_SIG_STATUS) { + if (!mmr->sig) { + ret = -EINVAL; + pr_err("signature status check requested on a non-signature enabled MR\n"); + goto done; + } + + mmr->sig->sig_status_checked = true; + if (!mmr->sig->sig_err_exists) + goto done; + + if (ibmr->lkey == mmr->sig->err_item.key) + memcpy(&mr_status->sig_err, &mmr->sig->err_item, + sizeof(mr_status->sig_err)); + else { + mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; + mr_status->sig_err.sig_err_offset = 0; + mr_status->sig_err.key = mmr->sig->err_item.key; + } + + mmr->sig->sig_err_exists = false; + mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; + } + +done: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/odp.c b/kernel/drivers/infiniband/hw/mlx5/odp.c new file mode 100644 index 000000000..5099db08a --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/odp.c @@ -0,0 +1,798 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> + +#include "mlx5_ib.h" + +#define MAX_PREFETCH_LEN (4*1024*1024U) + +/* Timeout in ms to wait for an active mmu notifier to complete when handling + * a pagefault. */ +#define MMU_NOTIFIER_TIMEOUT 1000 + +struct workqueue_struct *mlx5_ib_page_fault_wq; + +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end) +{ + struct mlx5_ib_mr *mr; + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; + u64 idx = 0, blk_start_idx = 0; + int in_block = 0; + u64 addr; + + if (!umem || !umem->odp_data) { + pr_err("invalidation called on NULL umem or non-ODP umem\n"); + return; + } + + mr = umem->odp_data->private; + + if (!mr || !mr->ibmr.pd) + return; + + start = max_t(u64, ib_umem_start(umem), start); + end = min_t(u64, ib_umem_end(umem), end); + + /* + * Iteration one - zap the HW's MTTs. The notifiers_count ensures that + * while we are doing the invalidation, no page fault will attempt to + * overwrite the same MTTs. Concurent invalidations might race us, + * but they will write 0s as well, so no difference in the end result. + */ + + for (addr = start; addr < end; addr += (u64)umem->page_size) { + idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; + /* + * Strive to write the MTTs in chunks, but avoid overwriting + * non-existing MTTs. The huristic here can be improved to + * estimate the cost of another UMR vs. the cost of bigger + * UMR. + */ + if (umem->odp_data->dma_list[idx] & + (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (!in_block) { + blk_start_idx = idx; + in_block = 1; + } + } else { + u64 umr_offset = idx & umr_block_mask; + + if (in_block && umr_offset == 0) { + mlx5_ib_update_mtt(mr, blk_start_idx, + idx - blk_start_idx, 1); + in_block = 0; + } + } + } + if (in_block) + mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1, + 1); + + /* + * We are now sure that the device will not access the + * memory. We can safely unmap it, and mark it as dirty if + * needed. + */ + + ib_umem_odp_unmap_dma_pages(umem, start, end); +} + +#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \ + if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \ + ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \ +} while (0) + +int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) +{ + int err; + struct mlx5_odp_caps hw_caps; + struct ib_odp_caps *caps = &dev->odp_caps; + + memset(caps, 0, sizeof(*caps)); + + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)) + return 0; + + err = mlx5_query_odp_caps(dev->mdev, &hw_caps); + if (err) + goto out; + + caps->general_caps = IB_ODP_SUPPORT; + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps, + SEND); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + SEND); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + RECV); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + WRITE); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + READ); + +out: + return err; +} + +static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, + u32 key) +{ + u32 base_key = mlx5_base_mkey(key); + struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key); + struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr); + + if (!mmr || mmr->key != key || !mr->live) + return NULL; + + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + +static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, + int error) { + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn, + pfault->mpfault.flags, + error); + if (ret) + pr_err("Failed to resolve the page fault on QP 0x%x\n", + qp->mqp.qpn); +} + +/* + * Handle a single data segment in a page-fault WQE. + * + * Returns number of pages retrieved on success. The caller will continue to + * the next data segment. + * Can return the following error codes: + * -EAGAIN to designate a temporary error. The caller will abort handling the + * page fault and resolve it. + * -EFAULT when there's an error mapping the requested pages. The caller will + * abort the page fault handling and possibly move the QP to an error state. + * On other errors the QP should also be closed with an error. + */ +static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, + u32 key, u64 io_virt, size_t bcnt, + u32 *bytes_mapped) +{ + struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device); + int srcu_key; + unsigned int current_seq; + u64 start_idx; + int npages = 0, ret = 0; + struct mlx5_ib_mr *mr; + u64 access_mask = ODP_READ_ALLOWED_BIT; + + srcu_key = srcu_read_lock(&mib_dev->mr_srcu); + mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key); + /* + * If we didn't find the MR, it means the MR was closed while we were + * handling the ODP event. In this case we return -EFAULT so that the + * QP will be closed. + */ + if (!mr || !mr->ibmr.pd) { + pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", + key); + ret = -EFAULT; + goto srcu_unlock; + } + if (!mr->umem->odp_data) { + pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += + (bcnt - pfault->mpfault.bytes_committed); + goto srcu_unlock; + } + if (mr->ibmr.pd != qp->ibqp.pd) { + pr_err("Page-fault with different PDs for QP and MR.\n"); + ret = -EFAULT; + goto srcu_unlock; + } + + current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq); + /* + * Ensure the sequence number is valid for some time before we call + * gup. + */ + smp_rmb(); + + /* + * Avoid branches - this code will perform correctly + * in all iterations (in iteration 2 and above, + * bytes_committed == 0). + */ + io_virt += pfault->mpfault.bytes_committed; + bcnt -= pfault->mpfault.bytes_committed; + + start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT; + + if (mr->umem->writable) + access_mask |= ODP_WRITE_ALLOWED_BIT; + npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt, + access_mask, current_seq); + if (npages < 0) { + ret = npages; + goto srcu_unlock; + } + + if (npages > 0) { + mutex_lock(&mr->umem->odp_data->umem_mutex); + if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { + /* + * No need to check whether the MTTs really belong to + * this MR, since ib_umem_odp_map_dma_pages already + * checks this. + */ + ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); + } else { + ret = -EAGAIN; + } + mutex_unlock(&mr->umem->odp_data->umem_mutex); + if (ret < 0) { + if (ret != -EAGAIN) + pr_err("Failed to update mkey page tables\n"); + goto srcu_unlock; + } + + if (bytes_mapped) { + u32 new_mappings = npages * PAGE_SIZE - + (io_virt - round_down(io_virt, PAGE_SIZE)); + *bytes_mapped += min_t(u32, new_mappings, bcnt); + } + } + +srcu_unlock: + if (ret == -EAGAIN) { + if (!mr->umem->odp_data->dying) { + struct ib_umem_odp *odp_data = mr->umem->odp_data; + unsigned long timeout = + msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); + + if (!wait_for_completion_timeout( + &odp_data->notifier_completion, + timeout)) { + pr_warn("timeout waiting for mmu notifier completion\n"); + } + } else { + /* The MR is being killed, kill the QP as well. */ + ret = -EFAULT; + } + } + srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); + pfault->mpfault.bytes_committed = 0; + return ret ? ret : npages; +} + +/** + * Parse a series of data segments for page fault handling. + * + * @qp the QP on which the fault occurred. + * @pfault contains page fault information. + * @wqe points at the first data segment in the WQE. + * @wqe_end points after the end of the WQE. + * @bytes_mapped receives the number of bytes that the function was able to + * map. This allows the caller to decide intelligently whether + * enough memory was mapped to resolve the page fault + * successfully (e.g. enough for the next MTU, or the entire + * WQE). + * @total_wqe_bytes receives the total data size of this WQE in bytes (minus + * the committed bytes). + * + * Returns the number of pages loaded if positive, zero for an empty WQE, or a + * negative error code. + */ +static int pagefault_data_segments(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, void *wqe, + void *wqe_end, u32 *bytes_mapped, + u32 *total_wqe_bytes, int receive_queue) +{ + int ret = 0, npages = 0; + u64 io_virt; + u32 key; + u32 byte_count; + size_t bcnt; + int inline_segment; + + /* Skip SRQ next-WQE segment. */ + if (receive_queue && qp->ibqp.srq) + wqe += sizeof(struct mlx5_wqe_srq_next_seg); + + if (bytes_mapped) + *bytes_mapped = 0; + if (total_wqe_bytes) + *total_wqe_bytes = 0; + + while (wqe < wqe_end) { + struct mlx5_wqe_data_seg *dseg = wqe; + + io_virt = be64_to_cpu(dseg->addr); + key = be32_to_cpu(dseg->lkey); + byte_count = be32_to_cpu(dseg->byte_count); + inline_segment = !!(byte_count & MLX5_INLINE_SEG); + bcnt = byte_count & ~MLX5_INLINE_SEG; + + if (inline_segment) { + bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; + wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, + 16); + } else { + wqe += sizeof(*dseg); + } + + /* receive WQE end of sg list. */ + if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && + io_virt == 0) + break; + + if (!inline_segment && total_wqe_bytes) { + *total_wqe_bytes += bcnt - min_t(size_t, bcnt, + pfault->mpfault.bytes_committed); + } + + /* A zero length data segment designates a length of 2GB. */ + if (bcnt == 0) + bcnt = 1U << 31; + + if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) { + pfault->mpfault.bytes_committed -= + min_t(size_t, bcnt, + pfault->mpfault.bytes_committed); + continue; + } + + ret = pagefault_single_data_segment(qp, pfault, key, io_virt, + bcnt, bytes_mapped); + if (ret < 0) + break; + npages += ret; + } + + return ret < 0 ? ret : npages; +} + +/* + * Parse initiator WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. + */ +static int mlx5_ib_mr_initiator_pfault_handler( + struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, + void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + struct mlx5_wqe_ctrl_seg *ctrl = *wqe; + u16 wqe_index = pfault->mpfault.wqe.wqe_index; + unsigned ds, opcode; +#if defined(DEBUG) + u32 ctrl_wqe_index, ctrl_qpn; +#endif + + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + if (ds * MLX5_WQE_DS_UNITS > wqe_length) { + mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", + ds, wqe_length); + return -EFAULT; + } + + if (ds == 0) { + mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", + wqe_index, qp->mqp.qpn); + return -EFAULT; + } + +#if defined(DEBUG) + ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_WQE_INDEX_MASK) >> + MLX5_WQE_CTRL_WQE_INDEX_SHIFT; + if (wqe_index != ctrl_wqe_index) { + mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", + wqe_index, qp->mqp.qpn, + ctrl_wqe_index); + return -EFAULT; + } + + ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> + MLX5_WQE_CTRL_QPN_SHIFT; + if (qp->mqp.qpn != ctrl_qpn) { + mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", + wqe_index, qp->mqp.qpn, + ctrl_qpn); + return -EFAULT; + } +#endif /* DEBUG */ + + *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; + *wqe += sizeof(*ctrl); + + opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_OPCODE_MASK; + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + switch (opcode) { + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_IMM: + case MLX5_OPCODE_SEND_INVAL: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_SEND)) + goto invalid_transport_or_opcode; + break; + case MLX5_OPCODE_RDMA_WRITE: + case MLX5_OPCODE_RDMA_WRITE_IMM: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_WRITE)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + case MLX5_OPCODE_RDMA_READ: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_READ)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + default: + goto invalid_transport_or_opcode; + } + break; + case IB_QPT_UD: + switch (opcode) { + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_IMM: + if (!(dev->odp_caps.per_transport_caps.ud_odp_caps & + IB_ODP_SUPPORT_SEND)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_datagram_seg); + break; + default: + goto invalid_transport_or_opcode; + } + break; + default: +invalid_transport_or_opcode: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n", + qp->ibqp.qp_type, opcode); + return -EFAULT; + } + + return 0; +} + +/* + * Parse responder WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. + */ +static int mlx5_ib_mr_responder_pfault_handler( + struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, + void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + struct mlx5_ib_wq *wq = &qp->rq; + int wqe_size = 1 << wq->wqe_shift; + + if (qp->ibqp.srq) { + mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); + return -EFAULT; + } + + if (qp->wq_sig) { + mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); + return -EFAULT; + } + + if (wqe_size > wqe_length) { + mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); + return -EFAULT; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_RECV)) + goto invalid_transport_or_opcode; + break; + default: +invalid_transport_or_opcode: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n", + qp->ibqp.qp_type); + return -EFAULT; + } + + *wqe_end = *wqe + wqe_size; + + return 0; +} + +static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + int ret; + void *wqe, *wqe_end; + u32 bytes_mapped, total_wqe_bytes; + char *buffer = NULL; + int resume_with_error = 0; + u16 wqe_index = pfault->mpfault.wqe.wqe_index; + int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; + + buffer = (char *)__get_free_page(GFP_KERNEL); + if (!buffer) { + mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); + resume_with_error = 1; + goto resolve_page_fault; + } + + ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, + PAGE_SIZE); + if (ret < 0) { + mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", + -ret, wqe_index, qp->mqp.qpn); + resume_with_error = 1; + goto resolve_page_fault; + } + + wqe = buffer; + if (requestor) + ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe, + &wqe_end, ret); + else + ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe, + &wqe_end, ret); + if (ret < 0) { + resume_with_error = 1; + goto resolve_page_fault; + } + + if (wqe >= wqe_end) { + mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); + resume_with_error = 1; + goto resolve_page_fault; + } + + ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped, + &total_wqe_bytes, !requestor); + if (ret == -EAGAIN) { + goto resolve_page_fault; + } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { + mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n", + -ret); + resume_with_error = 1; + goto resolve_page_fault; + } + +resolve_page_fault: + mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", + qp->mqp.qpn, resume_with_error, pfault->mpfault.flags); + + free_page((unsigned long)buffer); +} + +static int pages_in_range(u64 address, u32 length) +{ + return (ALIGN(address + length, PAGE_SIZE) - + (address & PAGE_MASK)) >> PAGE_SHIFT; +} + +static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + struct mlx5_pagefault *mpfault = &pfault->mpfault; + u64 address; + u32 length; + u32 prefetch_len = mpfault->bytes_committed; + int prefetch_activated = 0; + u32 rkey = mpfault->rdma.r_key; + int ret; + + /* The RDMA responder handler handles the page fault in two parts. + * First it brings the necessary pages for the current packet + * (and uses the pfault context), and then (after resuming the QP) + * prefetches more pages. The second operation cannot use the pfault + * context and therefore uses the dummy_pfault context allocated on + * the stack */ + struct mlx5_ib_pfault dummy_pfault = {}; + + dummy_pfault.mpfault.bytes_committed = 0; + + mpfault->rdma.rdma_va += mpfault->bytes_committed; + mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed, + mpfault->rdma.rdma_op_len); + mpfault->bytes_committed = 0; + + address = mpfault->rdma.rdma_va; + length = mpfault->rdma.rdma_op_len; + + /* For some operations, the hardware cannot tell the exact message + * length, and in those cases it reports zero. Use prefetch + * logic. */ + if (length == 0) { + prefetch_activated = 1; + length = mpfault->rdma.packet_size; + prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); + } + + ret = pagefault_single_data_segment(qp, pfault, rkey, address, length, + NULL); + if (ret == -EAGAIN) { + /* We're racing with an invalidation, don't prefetch */ + prefetch_activated = 0; + } else if (ret < 0 || pages_in_range(address, length) > ret) { + mlx5_ib_page_fault_resume(qp, pfault, 1); + return; + } + + mlx5_ib_page_fault_resume(qp, pfault, 0); + + /* At this point, there might be a new pagefault already arriving in + * the eq, switch to the dummy pagefault for the rest of the + * processing. We're still OK with the objects being alive as the + * work-queue is being fenced. */ + + if (prefetch_activated) { + ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey, + address, + prefetch_len, + NULL); + if (ret < 0) { + pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n", + ret, prefetch_activated, + qp->ibqp.qp_num, address, prefetch_len); + } + } +} + +void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + u8 event_subtype = pfault->mpfault.event_subtype; + + switch (event_subtype) { + case MLX5_PFAULT_SUBTYPE_WQE: + mlx5_ib_mr_wqe_pfault_handler(qp, pfault); + break; + case MLX5_PFAULT_SUBTYPE_RDMA: + mlx5_ib_mr_rdma_pfault_handler(qp, pfault); + break; + default: + pr_warn("Invalid page fault event subtype: 0x%x\n", + event_subtype); + mlx5_ib_page_fault_resume(qp, pfault, 1); + break; + } +} + +static void mlx5_ib_qp_pfault_action(struct work_struct *work) +{ + struct mlx5_ib_pfault *pfault = container_of(work, + struct mlx5_ib_pfault, + work); + enum mlx5_ib_pagefault_context context = + mlx5_ib_get_pagefault_context(&pfault->mpfault); + struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp, + pagefaults[context]); + mlx5_ib_mr_pfault_handler(qp, pfault); +} + +void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->disable_page_faults_lock, flags); + qp->disable_page_faults = 1; + spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); + + /* + * Note that at this point, we are guarenteed that no more + * work queue elements will be posted to the work queue with + * the QP we are closing. + */ + flush_workqueue(mlx5_ib_page_fault_wq); +} + +void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->disable_page_faults_lock, flags); + qp->disable_page_faults = 0; + spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); +} + +static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp, + struct mlx5_pagefault *pfault) +{ + /* + * Note that we will only get one fault event per QP per context + * (responder/initiator, read/write), until we resolve the page fault + * with the mlx5_ib_page_fault_resume command. Since this function is + * called from within the work element, there is no risk of missing + * events. + */ + struct mlx5_ib_qp *mibqp = to_mibqp(qp); + enum mlx5_ib_pagefault_context context = + mlx5_ib_get_pagefault_context(pfault); + struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context]; + + qp_pfault->mpfault = *pfault; + + /* No need to stop interrupts here since we are in an interrupt */ + spin_lock(&mibqp->disable_page_faults_lock); + if (!mibqp->disable_page_faults) + queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work); + spin_unlock(&mibqp->disable_page_faults_lock); +} + +void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) +{ + int i; + + qp->disable_page_faults = 1; + spin_lock_init(&qp->disable_page_faults_lock); + + qp->mqp.pfault_handler = mlx5_ib_pfault_handler; + + for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) + INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); +} + +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) +{ + int ret; + + ret = init_srcu_struct(&ibdev->mr_srcu); + if (ret) + return ret; + + return 0; +} + +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) +{ + cleanup_srcu_struct(&ibdev->mr_srcu); +} + +int __init mlx5_ib_odp_init(void) +{ + mlx5_ib_page_fault_wq = + create_singlethread_workqueue("mlx5_ib_page_faults"); + if (!mlx5_ib_page_fault_wq) + return -ENOMEM; + + return 0; +} + +void mlx5_ib_odp_cleanup(void) +{ + destroy_workqueue(mlx5_ib_page_fault_wq); +} diff --git a/kernel/drivers/infiniband/hw/mlx5/qp.c b/kernel/drivers/infiniband/hw/mlx5/qp.c new file mode 100644 index 000000000..d35f62d4f --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/qp.c @@ -0,0 +1,3174 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <rdma/ib_umem.h> +#include "mlx5_ib.h" +#include "user.h" + +/* not supported currently */ +static int wq_signature; + +enum { + MLX5_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX5_IB_LINK_TYPE_IB = 0, + MLX5_IB_LINK_TYPE_ETH = 1 +}; + +enum { + MLX5_IB_SQ_STRIDE = 6, + MLX5_IB_CACHE_LINE_SIZE = 64, +}; + +static const u32 mlx5_ib_opcode[] = { + [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, + [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, + [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, + [IB_WR_FAST_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, + [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, +}; + + +static int is_qp0(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_SMI; +} + +static int is_qp1(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_GSI; +} + +static int is_sqp(enum ib_qp_type qp_type) +{ + return is_qp0(qp_type) || is_qp1(qp_type); +} + +static void *get_wqe(struct mlx5_ib_qp *qp, int offset) +{ + return mlx5_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); +} + +/** + * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space. + * + * @qp: QP to copy from. + * @send: copy from the send queue when non-zero, use the receive queue + * otherwise. + * @wqe_index: index to start copying from. For send work queues, the + * wqe_index is in units of MLX5_SEND_WQE_BB. + * For receive work queue, it is the number of work queue + * element in the queue. + * @buffer: destination buffer. + * @length: maximum number of bytes to copy. + * + * Copies at least a single WQE, but may copy more data. + * + * Return: the number of bytes copied, or an error code. + */ +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length) +{ + struct ib_device *ibdev = qp->ibqp.device; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; + size_t offset; + size_t wq_end; + struct ib_umem *umem = qp->umem; + u32 first_copy_length; + int wqe_length; + int ret; + + if (wq->wqe_cnt == 0) { + mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n", + qp->ibqp.qp_type); + return -EINVAL; + } + + offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift); + wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift); + + if (send && length < sizeof(struct mlx5_wqe_ctrl_seg)) + return -EINVAL; + + if (offset > umem->length || + (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length)) + return -EINVAL; + + first_copy_length = min_t(u32, offset + length, wq_end) - offset; + ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length); + if (ret) + return ret; + + if (send) { + struct mlx5_wqe_ctrl_seg *ctrl = buffer; + int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + + wqe_length = ds * MLX5_WQE_DS_UNITS; + } else { + wqe_length = 1 << wq->wqe_shift; + } + + if (wqe_length <= first_copy_length) + return first_copy_length; + + ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset, + wqe_length - first_copy_length); + if (ret) + return ret; + + return wqe_length; +} + +static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) +{ + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct ib_event event; + + if (type == MLX5_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX5_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX5_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, + int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_general_caps *gen; + int wqe_size; + int wq_size; + + gen = &dev->mdev->caps.gen; + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > gen->max_wqes) + return -EINVAL; + + if (!has_rq) { + qp->rq.max_gs = 0; + qp->rq.wqe_cnt = 0; + qp->rq.wqe_shift = 0; + } else { + if (ucmd) { + qp->rq.wqe_cnt = ucmd->rq_wqe_count; + qp->rq.wqe_shift = ucmd->rq_wqe_shift; + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } else { + wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0; + wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); + wqe_size = roundup_pow_of_two(wqe_size); + wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; + wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); + qp->rq.wqe_cnt = wq_size / wqe_size; + if (wqe_size > gen->max_rq_desc_sz) { + mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", + wqe_size, + gen->max_rq_desc_sz); + return -EINVAL; + } + qp->rq.wqe_shift = ilog2(wqe_size); + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } + } + + return 0; +} + +static int sq_overhead(enum ib_qp_type qp_type) +{ + int size = 0; + + switch (qp_type) { + case IB_QPT_XRC_INI: + size += sizeof(struct mlx5_wqe_xrc_seg); + /* fall through */ + case IB_QPT_RC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + break; + + case IB_QPT_XRC_TGT: + return 0; + + case IB_QPT_UC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_datagram_seg); + break; + + case MLX5_IB_QPT_REG_UMR: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + default: + return -EINVAL; + } + + return size; +} + +static int calc_send_wqe(struct ib_qp_init_attr *attr) +{ + int inl_size = 0; + int size; + + size = sq_overhead(attr->qp_type); + if (size < 0) + return size; + + if (attr->cap.max_inline_data) { + inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + + attr->cap.max_inline_data; + } + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) + return MLX5_SIG_WQE_SIZE; + else + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); +} + +static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, + struct mlx5_ib_qp *qp) +{ + struct mlx5_general_caps *gen; + int wqe_size; + int wq_size; + + gen = &dev->mdev->caps.gen; + if (!attr->cap.max_send_wr) + return 0; + + wqe_size = calc_send_wqe(attr); + mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size); + if (wqe_size < 0) + return wqe_size; + + if (wqe_size > gen->max_sq_desc_sz) { + mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", + wqe_size, gen->max_sq_desc_sz); + return -EINVAL; + } + + qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - + sizeof(struct mlx5_wqe_inline_seg); + attr->cap.max_inline_data = qp->max_inline_data; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + qp->signature_en = true; + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); + qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > gen->max_wqes) { + mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n", + qp->sq.wqe_cnt, gen->max_wqes); + return -ENOMEM; + } + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.max_gs = attr->cap.max_send_sge; + qp->sq.max_post = wq_size / wqe_size; + attr->cap.max_send_wr = qp->sq.max_post; + + return wq_size; +} + +static int set_user_buf_size(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_general_caps *gen; + int desc_sz = 1 << qp->sq.wqe_shift; + + gen = &dev->mdev->caps.gen; + if (desc_sz > gen->max_sq_desc_sz) { + mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", + desc_sz, gen->max_sq_desc_sz); + return -EINVAL; + } + + if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) { + mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n", + ucmd->sq_wqe_count, ucmd->sq_wqe_count); + return -EINVAL; + } + + qp->sq.wqe_cnt = ucmd->sq_wqe_count; + + if (qp->sq.wqe_cnt > gen->max_wqes) { + mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", + qp->sq.wqe_cnt, gen->max_wqes); + return -EINVAL; + } + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + + return 0; +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq || + attr->qp_type == MLX5_IB_QPT_REG_UMR || + !attr->cap.max_recv_wr) + return 0; + + return 1; +} + +static int first_med_uuar(void) +{ + return 1; +} + +static int next_uuar(int n) +{ + n++; + + while (((n % 4) & 2)) + n++; + + return n; +} + +static int num_med_uuar(struct mlx5_uuar_info *uuari) +{ + int n; + + n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - + uuari->num_low_latency_uuars - 1; + + return n >= 0 ? n : 0; +} + +static int max_uuari(struct mlx5_uuar_info *uuari) +{ + return uuari->num_uars * 4; +} + +static int first_hi_uuar(struct mlx5_uuar_info *uuari) +{ + int med; + int i; + int t; + + med = num_med_uuar(uuari); + for (t = 0, i = first_med_uuar();; i = next_uuar(i)) { + t++; + if (t == med) + return next_uuar(i); + } + + return 0; +} + +static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) +{ + int i; + + for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) { + if (!test_bit(i, uuari->bitmap)) { + set_bit(i, uuari->bitmap); + uuari->count[i]++; + return i; + } + } + + return -ENOMEM; +} + +static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari) +{ + int minidx = first_med_uuar(); + int i; + + for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) { + if (uuari->count[i] < uuari->count[minidx]) + minidx = i; + } + + uuari->count[minidx]++; + return minidx; +} + +static int alloc_uuar(struct mlx5_uuar_info *uuari, + enum mlx5_ib_latency_class lat) +{ + int uuarn = -EINVAL; + + mutex_lock(&uuari->lock); + switch (lat) { + case MLX5_IB_LATENCY_CLASS_LOW: + uuarn = 0; + uuari->count[uuarn]++; + break; + + case MLX5_IB_LATENCY_CLASS_MEDIUM: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_med_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_HIGH: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_high_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_FAST_PATH: + uuarn = 2; + break; + } + mutex_unlock(&uuari->lock); + + return uuarn; +} + +static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; + int high_uuar = nuuars - uuari->num_low_latency_uuars; + + mutex_lock(&uuari->lock); + if (uuarn == 0) { + --uuari->count[uuarn]; + goto out; + } + + if (uuarn < high_uuar) { + free_med_class_uuar(uuari, uuarn); + goto out; + } + + free_high_class_uuar(uuari, uuarn); + +out: + mutex_unlock(&uuari->lock); +} + +static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX5_QP_STATE_RST; + case IB_QPS_INIT: return MLX5_QP_STATE_INIT; + case IB_QPS_RTR: return MLX5_QP_STATE_RTR; + case IB_QPS_RTS: return MLX5_QP_STATE_RTS; + case IB_QPS_SQD: return MLX5_QP_STATE_SQD; + case IB_QPS_SQE: return MLX5_QP_STATE_SQER; + case IB_QPS_ERR: return MLX5_QP_STATE_ERR; + default: return -1; + } +} + +static int to_mlx5_st(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: return MLX5_QP_ST_RC; + case IB_QPT_UC: return MLX5_QP_ST_UC; + case IB_QPT_UD: return MLX5_QP_ST_UD; + case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; + case IB_QPT_SMI: return MLX5_QP_ST_QP0; + case IB_QPT_GSI: return MLX5_QP_ST_QP1; + case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: return -EINVAL; + } +} + +static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) +{ + return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; +} + +static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct mlx5_create_qp_mbox_in **in, + struct mlx5_ib_create_qp_resp *resp, int *inlen) +{ + struct mlx5_ib_ucontext *context; + struct mlx5_ib_create_qp ucmd; + int page_shift = 0; + int uar_index; + int npages; + u32 offset = 0; + int uuarn; + int ncont = 0; + int err; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return err; + } + + context = to_mucontext(pd->uobject->context); + /* + * TBD: should come from the verbs when we have the API + */ + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "uuar allocation failed\n"); + return uuarn; + } + } + } + + uar_index = uuarn_to_uar_index(&context->uuari, uuarn); + mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); + + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + + err = set_user_buf_size(dev, qp, &ucmd); + if (err) + goto err_uuar; + + if (ucmd.buf_addr && qp->buf_size) { + qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(qp->umem); + goto err_uuar; + } + } else { + qp->umem = NULL; + } + + if (qp->umem) { + mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", + ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset); + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_umem; + } + if (qp->umem) + mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + (*in)->ctx.params2 = cpu_to_be32(offset << 6); + + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + resp->uuar_index = uuarn; + qp->uuarn = uuarn; + + err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_free; + } + + err = ib_copy_to_udata(udata, resp, sizeof(*resp)); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + goto err_unmap; + } + qp->create_type = MLX5_QP_USER; + + return 0; + +err_unmap: + mlx5_ib_db_unmap_user(context, &qp->db); + +err_free: + kvfree(*in); + +err_umem: + if (qp->umem) + ib_umem_release(qp->umem); + +err_uuar: + free_uuar(&context->uuari, uuarn); + return err; +} + +static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &qp->db); + if (qp->umem) + ib_umem_release(qp->umem); + free_uuar(&context->uuari, qp->uuarn); +} + +static int create_kernel_qp(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_mbox_in **in, int *inlen) +{ + enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; + struct mlx5_uuar_info *uuari; + int uar_index; + int uuarn; + int err; + + uuari = &dev->mdev->priv.uuari; + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + return -EINVAL; + + if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) + lc = MLX5_IB_LATENCY_CLASS_FAST_PATH; + + uuarn = alloc_uuar(uuari, lc); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "\n"); + return -ENOMEM; + } + + qp->bf = &uuari->bfs[uuarn]; + uar_index = qp->bf->uar->index; + + err = calc_sq_size(dev, init_attr, qp); + if (err < 0) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_uuar; + } + + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_uuar; + } + + qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + /* Set "fast registration enabled" for all kernel QPs */ + (*in)->ctx.params1 |= cpu_to_be32(1 << 11); + (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); + + mlx5_fill_page_array(&qp->buf, (*in)->pas); + + err = mlx5_db_alloc(dev->mdev, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_free; + } + + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL); + qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL); + qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL); + qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL); + + if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid || + !qp->sq.w_list || !qp->sq.wqe_head) { + err = -ENOMEM; + goto err_wrid; + } + qp->create_type = MLX5_QP_KERNEL; + + return 0; + +err_wrid: + mlx5_db_free(dev->mdev, &qp->db); + kfree(qp->sq.wqe_head); + kfree(qp->sq.w_list); + kfree(qp->sq.wrid); + kfree(qp->sq.wr_data); + kfree(qp->rq.wrid); + +err_free: + kvfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &qp->buf); + +err_uuar: + free_uuar(&dev->mdev->priv.uuari, uuarn); + return err; +} + +static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_db_free(dev->mdev, &qp->db); + kfree(qp->sq.wqe_head); + kfree(qp->sq.w_list); + kfree(qp->sq.wrid); + kfree(qp->sq.wr_data); + kfree(qp->rq.wrid); + mlx5_buf_free(dev->mdev, &qp->buf); + free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn); +} + +static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) +{ + if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || + (attr->qp_type == IB_QPT_XRC_INI)) + return cpu_to_be32(MLX5_SRQ_RQ); + else if (!qp->has_rq) + return cpu_to_be32(MLX5_ZERO_LEN_RQ); + else + return cpu_to_be32(MLX5_NON_ZERO_RQ); +} + +static int is_connected(enum ib_qp_type qp_type) +{ + if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) + return 1; + + return 0; +} + +static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct mlx5_ib_create_qp_resp resp; + struct mlx5_create_qp_mbox_in *in; + struct mlx5_general_caps *gen; + struct mlx5_ib_create_qp ucmd; + int inlen = sizeof(*in); + int err; + + mlx5_ib_odp_create_qp(qp); + + gen = &dev->mdev->caps.gen; + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (!(gen->flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) { + mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); + return -EINVAL; + } else { + qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + } + } + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + if (pd && pd->uobject) { + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); + qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); + } else { + qp->wq_sig = !!wq_signature; + } + + qp->has_rq = qp_has_rq(init_attr); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, + qp, (pd && pd->uobject) ? &ucmd : NULL); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + if (pd) { + if (pd->uobject) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); + if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || + ucmd.rq_wqe_count != qp->rq.wqe_cnt) { + mlx5_ib_dbg(dev, "invalid rq params\n"); + return -EINVAL; + } + if (ucmd.sq_wqe_count > gen->max_wqes) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", + ucmd.sq_wqe_count, gen->max_wqes); + return -EINVAL; + } + err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + } else { + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + else + qp->pa_lkey = to_mpd(pd)->pa_lkey; + } + + if (err) + return err; + } else { + in = mlx5_vzalloc(sizeof(*in)); + if (!in) + return -ENOMEM; + + qp->create_type = MLX5_QP_EMPTY; + } + + if (is_sqp(init_attr->qp_type)) + qp->port = init_attr->port_num; + + in->ctx.flags = cpu_to_be32(to_mlx5_st(init_attr->qp_type) << 16 | + MLX5_QP_PM_MIGRATED << 11); + + if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) + in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? pd : devr->p0)->pdn); + else + in->ctx.flags_pd = cpu_to_be32(MLX5_QP_LAT_SENSITIVE); + + if (qp->wq_sig) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); + + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { + int rcqe_sz; + int scqe_sz; + + rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); + scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); + + if (rcqe_sz == 128) + in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE; + else + in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE; + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) { + if (scqe_sz == 128) + in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE; + else + in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE; + } + } + + if (qp->rq.wqe_cnt) { + in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4); + in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3; + } + + in->ctx.rq_type_srqn = get_rx_type(qp, init_attr); + + if (qp->sq.wqe_cnt) + in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11); + else + in->ctx.sq_crq_size |= cpu_to_be16(0x8000); + + /* Set default resources */ + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn); + break; + case IB_QPT_XRC_INI: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + break; + default: + if (init_attr->srq) { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn); + } else { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + } + } + + if (init_attr->send_cq) + in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn); + + if (init_attr->recv_cq) + in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn); + + in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); + + err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen); + if (err) { + mlx5_ib_dbg(dev, "create qp failed\n"); + goto err_create; + } + + kvfree(in); + /* Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = mlx5_ib_qp_event; + + return 0; + +err_create: + if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(pd, qp); + else if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + + kvfree(in); + return err; +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, + SINGLE_DEPTH_NESTING); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, + SINGLE_DEPTH_NESTING); + } + } else { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } + } else if (recv_cq) { + spin_lock_irq(&recv_cq->lock); + __acquire(&send_cq->lock); + } else { + __acquire(&send_cq->lock); + __acquire(&recv_cq->lock); + } +} + +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } + } else { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } + } else if (recv_cq) { + __release(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } else { + __release(&recv_cq->lock); + __release(&send_cq->lock); + } +} + +static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) +{ + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(struct mlx5_ib_qp *qp, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +{ + switch (qp->ibqp.qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = NULL; + *recv_cq = NULL; + break; + case MLX5_IB_QPT_REG_UMR: + case IB_QPT_XRC_INI: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = NULL; + break; + + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = to_mcq(qp->ibqp.recv_cq); + break; + + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: + *send_cq = NULL; + *recv_cq = NULL; + break; + } +} + +static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_modify_qp_mbox_in *in; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return; + + if (qp->state != IB_QPS_RESET) { + mlx5_ib_qp_disable_pagefaults(qp); + if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), + MLX5_QP_STATE_RST, in, 0, &qp->mqp)) + mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", + qp->mqp.qpn); + } + + get_cqs(qp, &send_cq, &recv_cq); + + if (qp->create_type == MLX5_QP_KERNEL) { + mlx5_ib_lock_cqs(send_cq, recv_cq); + __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + } + + err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn); + kfree(in); + + + if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + else if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(&get_pd(qp)->ibpd, qp); +} + +static const char *ib_qp_type_str(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_SMI: + return "IB_QPT_SMI"; + case IB_QPT_GSI: + return "IB_QPT_GSI"; + case IB_QPT_RC: + return "IB_QPT_RC"; + case IB_QPT_UC: + return "IB_QPT_UC"; + case IB_QPT_UD: + return "IB_QPT_UD"; + case IB_QPT_RAW_IPV6: + return "IB_QPT_RAW_IPV6"; + case IB_QPT_RAW_ETHERTYPE: + return "IB_QPT_RAW_ETHERTYPE"; + case IB_QPT_XRC_INI: + return "IB_QPT_XRC_INI"; + case IB_QPT_XRC_TGT: + return "IB_QPT_XRC_TGT"; + case IB_QPT_RAW_PACKET: + return "IB_QPT_RAW_PACKET"; + case MLX5_IB_QPT_REG_UMR: + return "MLX5_IB_QPT_REG_UMR"; + case IB_QPT_MAX: + default: + return "Invalid QP type"; + } +} + +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_general_caps *gen; + struct mlx5_ib_dev *dev; + struct mlx5_ib_qp *qp; + u16 xrcdn = 0; + int err; + + if (pd) { + dev = to_mdev(pd->device); + } else { + /* being cautious here */ + if (init_attr->qp_type != IB_QPT_XRC_TGT && + init_attr->qp_type != MLX5_IB_QPT_REG_UMR) { + pr_warn("%s: no PD for transport %s\n", __func__, + ib_qp_type_str(init_attr->qp_type)); + return ERR_PTR(-EINVAL); + } + dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + } + gen = &dev->mdev->caps.gen; + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + case IB_QPT_XRC_INI: + if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) { + mlx5_ib_dbg(dev, "XRC not supported\n"); + return ERR_PTR(-ENOSYS); + } + init_attr->recv_cq = NULL; + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = NULL; + } + + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + case MLX5_IB_QPT_REG_UMR: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + err = create_qp_common(dev, pd, init_attr, udata, qp); + if (err) { + mlx5_ib_dbg(dev, "create_qp_common failed\n"); + kfree(qp); + return ERR_PTR(err); + } + + if (is_qp0(init_attr->qp_type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(init_attr->qp_type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->mqp.qpn; + + mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn, + to_mcq(init_attr->send_cq)->mcq.cqn); + + qp->xrcdn = xrcdn; + + break; + + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: + mlx5_ib_dbg(dev, "unsupported qp type %d\n", + init_attr->qp_type); + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +int mlx5_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + + destroy_qp_common(dev, mqp); + + kfree(mqp); + + return 0; +} + +static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u32 hw_access_flags = 0; + u8 dest_rd_atomic; + u32 access_flags; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX5_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX); + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX5_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +enum { + MLX5_PATH_FLAG_FL = 1 << 0, + MLX5_PATH_FLAG_FREE_AR = 1 << 1, + MLX5_PATH_FLAG_COUNTER = 1 << 2, +}; + +static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +{ + struct mlx5_general_caps *gen; + + gen = &dev->mdev->caps.gen; + if (rate == IB_RATE_PORT_CURRENT) { + return 0; + } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { + return -EINVAL; + } else { + while (rate != IB_RATE_2_5_GBPS && + !(1 << (rate + MLX5_STAT_RATE_OFFSET) & + gen->stat_rate_support)) + --rate; + } + + return rate + MLX5_STAT_RATE_OFFSET; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, + struct mlx5_qp_path *path, u8 port, int attr_mask, + u32 path_flags, const struct ib_qp_attr *attr) +{ + struct mlx5_general_caps *gen; + int err; + + gen = &dev->mdev->caps.gen; + path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0; + + if (attr_mask & IB_QP_PKEY_INDEX) + path->pkey_index = attr->pkey_index; + + path->grh_mlid = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + + if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= gen->port[port - 1].gid_table_len) { + pr_err("sgid_index (%u) too large. max is %d\n", + ah->grh.sgid_index, gen->port[port - 1].gid_table_len); + return -EINVAL; + } + path->grh_mlid |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->tclass_flowlabel = + cpu_to_be32((ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } + + err = ib_rate_to_mlx5(dev, ah->static_rate); + if (err < 0) + return err; + path->static_rate = err; + path->port = port; + + if (attr_mask & IB_QP_TIMEOUT) + path->ackto_lt = attr->timeout << 3; + + path->sl = ah->sl & 0xf; + + return 0; +} + +static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_PRI_PORT, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + }, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + }, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_SRQN | + MLX5_QP_OPTPAR_CQN_RCV, + }, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + }, + }, +}; + +static int ib_nr_to_mlx5_nr(int ib_mask) +{ + switch (ib_mask) { + case IB_QP_STATE: + return 0; + case IB_QP_CUR_STATE: + return 0; + case IB_QP_EN_SQD_ASYNC_NOTIFY: + return 0; + case IB_QP_ACCESS_FLAGS: + return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE; + case IB_QP_PKEY_INDEX: + return MLX5_QP_OPTPAR_PKEY_INDEX; + case IB_QP_PORT: + return MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_QKEY: + return MLX5_QP_OPTPAR_Q_KEY; + case IB_QP_AV: + return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_PATH_MTU: + return 0; + case IB_QP_TIMEOUT: + return MLX5_QP_OPTPAR_ACK_TIMEOUT; + case IB_QP_RETRY_CNT: + return MLX5_QP_OPTPAR_RETRY_COUNT; + case IB_QP_RNR_RETRY: + return MLX5_QP_OPTPAR_RNR_RETRY; + case IB_QP_RQ_PSN: + return 0; + case IB_QP_MAX_QP_RD_ATOMIC: + return MLX5_QP_OPTPAR_SRA_MAX; + case IB_QP_ALT_PATH: + return MLX5_QP_OPTPAR_ALT_ADDR_PATH; + case IB_QP_MIN_RNR_TIMER: + return MLX5_QP_OPTPAR_RNR_TIMEOUT; + case IB_QP_SQ_PSN: + return 0; + case IB_QP_MAX_DEST_RD_ATOMIC: + return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; + case IB_QP_PATH_MIG_STATE: + return MLX5_QP_OPTPAR_PM_STATE; + case IB_QP_CAP: + return 0; + case IB_QP_DEST_QPN: + return 0; + } + return 0; +} + +static int ib_mask_to_mlx5_opt(int ib_mask) +{ + int result = 0; + int i; + + for (i = 0; i < 8 * sizeof(int); i++) { + if ((1 << i) & ib_mask) + result |= ib_nr_to_mlx5_nr(1 << i); + } + + return result; +} + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_qp_context *context; + struct mlx5_general_caps *gen; + struct mlx5_modify_qp_mbox_in *in; + struct mlx5_ib_pd *pd; + enum mlx5_qp_state mlx5_cur, mlx5_new; + enum mlx5_qp_optpar optpar; + int sqd_event; + int mlx5_st; + int err; + + gen = &dev->mdev->caps.gen; + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + context = &in->ctx; + err = to_mlx5_st(ibqp->qp_type); + if (err < 0) + goto out; + + context->flags = cpu_to_be32(err << 16); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + } else { + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); + break; + } + } + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + context->mtu_msgmax = (IB_MTU_256 << 5) | 8; + } else if (ibqp->qp_type == IB_QPT_UD || + ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { + context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu); + err = -EINVAL; + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | gen->log_max_msg; + } + + if (attr_mask & IB_QP_DEST_QPN) + context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PKEY_INDEX) + context->pri_path.pkey_index = attr->pkey_index; + + /* todo implement counter_index functionality */ + + if (is_sqp(ibqp->qp_type)) + context->pri_path.port = qp->port; + + if (attr_mask & IB_QP_PORT) + context->pri_path.port = attr->port_num; + + if (attr_mask & IB_QP_AV) { + err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port, + attr_mask, 0, attr); + if (err) + goto out; + } + + if (attr_mask & IB_QP_TIMEOUT) + context->pri_path.ackto_lt |= attr->timeout << 3; + + if (attr_mask & IB_QP_ALT_PATH) { + err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + attr->alt_port_num, attr_mask, 0, attr); + if (err) + goto out; + } + + pd = get_pd(qp); + get_cqs(qp, &send_cq, &recv_cq); + + context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; + context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0; + context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); + + if (attr_mask & IB_QP_RNR_RETRY) + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + + if (attr_mask & IB_QP_RETRY_CNT) + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + if (attr_mask & IB_QP_QKEY) + context->qkey = cpu_to_be32(attr->qkey); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->sq_crq_size |= cpu_to_be16(1 << 4); + + + mlx5_cur = to_mlx5_state(cur_state); + mlx5_new = to_mlx5_state(new_state); + mlx5_st = to_mlx5_st(ibqp->qp_type); + if (mlx5_st < 0) + goto out; + + /* If moving to a reset or error state, we must disable page faults on + * this QP and flush all current page faults. Otherwise a stale page + * fault may attempt to work on this QP after it is reset and moved + * again to RTS, and may cause the driver and the device to get out of + * sync. */ + if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + mlx5_ib_qp_disable_pagefaults(qp); + + optpar = ib_mask_to_mlx5_opt(attr_mask); + optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; + in->optparam = cpu_to_be32(optpar); + err = mlx5_core_qp_modify(dev->mdev, to_mlx5_state(cur_state), + to_mlx5_state(new_state), in, sqd_event, + &qp->mqp); + if (err) + goto out; + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + mlx5_ib_qp_enable_pagefaults(qp); + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && !ibqp->uobject) { + mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq.cur_post = 0; + qp->sq.last_poll = 0; + qp->db.db[MLX5_RCV_DBR] = 0; + qp->db.db[MLX5_SND_DBR] = 0; + } + +out: + kfree(in); + return err; +} + +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + struct mlx5_general_caps *gen; + int err = -EINVAL; + int port; + + gen = &dev->mdev->caps.gen; + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && + !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_UNSPECIFIED)) + goto out; + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > gen->num_ports)) + goto out; + + if (attr_mask & IB_QP_PKEY_INDEX) { + port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= gen->port[port - 1].pkey_table_len) + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > (1 << gen->log_max_ra_res_qp)) + goto out; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > (1 << gen->log_max_ra_req_qp)) + goto out; + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + struct mlx5_ib_cq *cq; + unsigned cur; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + struct ib_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static __be16 get_klm_octo(int npages) +{ + return cpu_to_be16(ALIGN(npages, 8) / 2); +} + +static __be64 frwr_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 sig_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + +static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, int li) +{ + memset(umr, 0, sizeof(*umr)); + + if (li) { + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = 1 << 7; + return; + } + + umr->flags = (1 << 5); /* fail if not free */ + umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len); + umr->mkey_mask = frwr_mkey_mask(); +} + +static __be64 get_umr_reg_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_unreg_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_mtt_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr) +{ + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + memset(umr, 0, sizeof(*umr)); + + if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) + umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */ + else + umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ + + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { + umr->klm_octowords = get_klm_octo(umrwr->npages); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) { + umr->mkey_mask = get_umr_update_mtt_mask(); + umr->bsf_octowords = get_klm_octo(umrwr->target.offset); + umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; + } else { + umr->mkey_mask = get_umr_reg_mr_mask(); + } + } else { + umr->mkey_mask = get_umr_unreg_mr_mask(); + } + + if (!wr->num_sge) + umr->flags |= MLX5_UMR_INLINE; +} + +static u8 get_umr_flags(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; +} + +static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, + int li, int *writ) +{ + memset(seg, 0, sizeof(*seg)); + if (li) { + seg->status = MLX5_MKEY_STATUS_FREE; + return; + } + + seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) | + MLX5_ACCESS_MODE_MTT; + *writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE); + seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + seg->len = cpu_to_be64(wr->wr.fast_reg.length); + seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2); + seg->log2_page_size = wr->wr.fast_reg.page_shift; +} + +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) +{ + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + memset(seg, 0, sizeof(*seg)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { + seg->status = MLX5_MKEY_STATUS_FREE; + return; + } + + seg->flags = convert_access(umrwr->access_flags); + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); + } + seg->len = cpu_to_be64(umrwr->length); + seg->log2_page_size = umrwr->page_shift; + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(umrwr->mkey)); +} + +static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, + struct ib_send_wr *wr, + struct mlx5_core_dev *mdev, + struct mlx5_ib_pd *pd, + int writ) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); + u64 *page_list = wr->wr.fast_reg.page_list->page_list; + u64 perm = MLX5_EN_RD | (writ ? MLX5_EN_WR : 0); + int i; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) + mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm); + dseg->addr = cpu_to_be64(mfrpl->map); + dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64)); + dseg->lkey = cpu_to_be32(pd->pa_lkey); +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static u8 calc_sig(void *wqe, int size) +{ + u8 *p = wqe; + u8 res = 0; + int i; + + for (i = 0; i < size; i++) + res ^= p[i]; + + return ~res; +} + +static u8 wq_sig(void *wqe) +{ + return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); +} + +static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx5_wqe_inline_seg *seg; + void *qend = qp->sq.qend; + void *addr; + int inl = 0; + int copy; + int len; + int i; + + seg = wqe; + wqe += sizeof(*seg); + for (i = 0; i < wr->num_sge; i++) { + addr = (void *)(unsigned long)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return -ENOMEM; + + if (unlikely(wqe + len > qend)) { + copy = qend - wqe; + memcpy(wqe, addr, copy); + addr += copy; + len -= copy; + wqe = mlx5_get_send_wqe(qp, 0); + } + memcpy(wqe, addr, len); + wqe += len; + } + + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + + *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16; + + return 0; +} + +static u16 prot_field_size(enum ib_signature_type type) +{ + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} + +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, + struct mlx5_bsf_inl *inl) +{ + /* Valid inline section and allow BSF refresh */ + inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | + MLX5_BSF_REFRESH_DIF); + inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); + inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); + /* repeating block */ + inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; + inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? + MLX5_DIF_CRC : MLX5_DIF_IPCS; + + if (domain->sig.dif.ref_remap) + inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; + + if (domain->sig.dif.app_escape) { + if (domain->sig.dif.ref_escape) + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; + else + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; + } + + inl->dif_app_bitmask_check = + cpu_to_be16(domain->sig.dif.apptag_check_mask); +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + + memset(bsf, 0, sizeof(*bsf)); + + /* Basic + Extended + Inline */ + basic->bsf_size_sbs = 1 << 7; + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + basic->raw_data_size = cpu_to_be32(data_size); + + /* Memory domain */ + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); + mlx5_fill_inl_bsf(mem, &bsf->m_inl); + break; + default: + return -EINVAL; + } + + /* Wire domain */ + switch (sig_attrs->wire.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig_type == wire->sig_type) { + /* Same block structure */ + basic->bsf_size_sbs |= 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; + } else + basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); + + basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); + mlx5_fill_inl_bsf(wire, &bsf->w_inl); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct ib_sig_attrs *sig_attrs = wr->wr.sig_handover.sig_attrs; + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + struct mlx5_bsf *bsf; + u32 data_len = wr->sg_list->length; + u32 data_key = wr->sg_list->lkey; + u64 data_va = wr->sg_list->addr; + int ret; + int wqe_size; + + if (!wr->wr.sig_handover.prot || + (data_key == wr->wr.sig_handover.prot->lkey && + data_va == wr->wr.sig_handover.prot->addr && + data_len == wr->wr.sig_handover.prot->length)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. + * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u32 prot_key = wr->wr.sig_handover.prot->lkey; + u64 prot_va = wr->wr.sig_handover.prot->addr; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + pr_err("Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) + return -EINVAL; + + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + struct ib_send_wr *wr, u32 nelements, + u32 length, u32 pdn) +{ + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(wr->wr.sig_handover.access_flags) | + MLX5_ACCESS_MODE_KLM; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, u32 nelements) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->klm_octowords = get_klm_octo(nelements); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + + +static int set_sig_umr_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct mlx5_ib_mr *sig_mr = to_mmr(wr->wr.sig_handover.sig_mr); + u32 pdn = get_pd(qp)->pdn; + u32 klm_oct_size; + int region_len, ret; + + if (unlikely(wr->num_sge != 1) || + unlikely(wr->wr.sig_handover.access_flags & + IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = wr->sg_list->length; + if (wr->wr.sig_handover.prot && + (wr->wr.sig_handover.prot->lkey != wr->sg_list->lkey || + wr->wr.sig_handover.prot->addr != wr->sg_list->addr || + wr->wr.sig_handover.prot->length != wr->sg_list->length)) + region_len += wr->wr.sig_handover.prot->length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + klm_oct_size = wr->wr.sig_handover.prot ? 3 : 1; + + set_sig_umr_segment(*seg, wr, klm_oct_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + ret = set_sig_data_segment(wr, qp, seg, size); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + break; + default: + pr_err("Bad signature type given.\n"); + return 1; + } + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + + return 0; +} + +static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, + struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp) +{ + int writ = 0; + int li; + + li = wr->opcode == IB_WR_LOCAL_INV ? 1 : 0; + if (unlikely(wr->send_flags & IB_SEND_INLINE)) + return -EINVAL; + + set_frwr_umr_segment(*seg, wr, li); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + set_mkey_segment(*seg, wr, li, &writ); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + if (!li) { + if (unlikely(wr->wr.fast_reg.page_list_len > + wr->wr.fast_reg.page_list->max_page_list_len)) + return -ENOMEM; + + set_frwr_pages(*seg, wr, mdev, pd, writ); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } + return 0; +} + +static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) +{ + __be32 *p = NULL; + int tidx = idx; + int i, j; + + pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx)); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + void *buf = mlx5_get_send_wqe(qp, tidx); + tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1); + p = buf; + j = 0; + } + pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), + be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), + be32_to_cpu(p[j + 3])); + } +} + +static void mlx5_bf_copy(u64 __iomem *dst, u64 *src, + unsigned bytecnt, struct mlx5_ib_qp *qp) +{ + while (bytecnt > 0) { + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + bytecnt -= 64; + if (unlikely(src == qp->sq.qend)) + src = mlx5_get_send_wqe(qp, 0); + } +} + +static u8 get_fence(u8 fence, struct ib_send_wr *wr) +{ + if (unlikely(wr->opcode == IB_WR_LOCAL_INV && + wr->send_flags & IB_SEND_FENCE)) + return MLX5_FENCE_MODE_STRONG_ORDERING; + + if (unlikely(fence)) { + if (wr->send_flags & IB_SEND_FENCE) + return MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + return fence; + + } else { + return 0; + } +} + +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq) +{ + int err = 0; + + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) { + err = -ENOMEM; + return err; + } + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_get_send_wqe(qp, *idx); + *ctrl = *seg; + *(uint32_t *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (wr->send_flags & IB_SEND_SIGNALED ? + MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + + return err; +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + u8 size, unsigned idx, u64 wr_id, + int nreq, u8 fence, u8 next_fence, + u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + qp->fm_cache = next_fence; + if (unlikely(qp->wq_sig)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = mlx5_opcode; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.w_list[idx].next = qp->sq.cur_post; +} + + +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_mr *mr; + struct mlx5_wqe_data_seg *dpseg; + struct mlx5_wqe_xrc_seg *xrc; + struct mlx5_bf *bf = qp->bf; + int uninitialized_var(size); + void *qend = qp->sq.qend; + unsigned long flags; + unsigned idx; + int err = 0; + int inl = 0; + int num_sge; + void *seg; + int nreq; + int i; + u8 next_fence = 0; + u8 fence; + + spin_lock_irqsave(&qp->sq.lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + fence = qp->fm_cache; + num_sge = wr->num_sge; + if (unlikely(num_sge > qp->sq.max_gs)) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + switch (ibqp->qp_type) { + case IB_QPT_XRC_INI: + xrc = seg; + xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num); + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + /* fall through */ + case IB_QPT_RC: + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -ENOSYS; + *bad_wr = wr; + goto out; + + case IB_WR_LOCAL_INV: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; + ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_FAST_REG_MR: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.wr_data[idx] = IB_WR_FAST_REG_MR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_REG_SIG_MR: + qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; + mr = to_mmr(wr->wr.sig_handover.sig_mr); + + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + err = set_sig_umr_wr(wr, qp, &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_UMR); + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + wr->send_flags &= ~IB_SEND_SIGNALED; + wr->send_flags |= IB_SEND_SOLICITED; + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->mem, + mr->sig->psv_memory.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->wire, + mr->sig->psv_wire.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + num_sge = 0; + goto skip_psv; + + default: + break; + } + break; + + case IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + default: + break; + } + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + case MLX5_IB_QPT_REG_UMR: + if (wr->opcode != MLX5_IB_WR_UMR) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode\n"); + goto out; + } + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + set_reg_umr_segment(seg, wr); + seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + set_reg_mkey_segment(seg, wr); + seg += sizeof(struct mlx5_mkey_seg); + size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + default: + break; + } + + if (wr->send_flags & IB_SEND_INLINE && num_sge) { + int uninitialized_var(sz); + + err = set_data_inl_seg(qp, wr, seg, &sz); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + inl = 1; + size += sz; + } else { + dpseg = seg; + for (i = 0; i < num_sge; i++) { + if (unlikely(dpseg == qend)) { + seg = mlx5_get_send_wqe(qp, 0); + dpseg = seg; + } + if (likely(wr->sg_list[i].length)) { + set_data_ptr_seg(dpseg, wr->sg_list + i); + size += sizeof(struct mlx5_wqe_data_seg) / 16; + dpseg++; + } + } + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + get_fence(fence, wr), next_fence, + mlx5_ib_opcode[wr->opcode]); +skip_psv: + if (0) + dump_wqe(qp, idx, size); + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell */ + wmb(); + + if (bf->need_lock) + spin_lock(&bf->lock); + else + __acquire(&bf->lock); + + /* TBD enable WC */ + if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) { + mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp); + /* wc_wmb(); */ + } else { + mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset, + MLX5_GET_DOORBELL_LOCK(&bf->lock32)); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); + } + bf->offset ^= bf->buf_size; + if (bf->need_lock) + spin_unlock(&bf->lock); + else + __release(&bf->lock); + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) +{ + sig->signature = calc_sig(sig, size); +} + +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + struct mlx5_rwqe_sig *sig; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + if (qp->wq_sig) + scat++; + + for (i = 0; i < wr->num_sge; i++) + set_data_ptr_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + + if (qp->wq_sig) { + sig = (struct mlx5_rwqe_sig *)scat; + set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) +{ + switch (mlx5_state) { + case MLX5_QP_STATE_RST: return IB_QPS_RESET; + case MLX5_QP_STATE_INIT: return IB_QPS_INIT; + case MLX5_QP_STATE_RTR: return IB_QPS_RTR; + case MLX5_QP_STATE_RTS: return IB_QPS_RTS; + case MLX5_QP_STATE_SQ_DRAINING: + case MLX5_QP_STATE_SQD: return IB_QPS_SQD; + case MLX5_QP_STATE_SQER: return IB_QPS_SQE; + case MLX5_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) +{ + switch (mlx5_mig_state) { + case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX5_QP_PM_REARM: return IB_MIG_REARM; + case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx5_flags) +{ + int ib_flags = 0; + + if (mlx5_flags & MLX5_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx5_flags & MLX5_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx5_flags & MLX5_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, + struct mlx5_qp_path *path) +{ + struct mlx5_core_dev *dev = ibdev->mdev; + + memset(ib_ah_attr, 0, sizeof(*ib_ah_attr)); + ib_ah_attr->port_num = path->port; + + if (ib_ah_attr->port_num == 0 || + ib_ah_attr->port_num > dev->caps.gen.num_ports) + return; + + ib_ah_attr->sl = path->sl & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof(ib_ah_attr->grh.dgid.raw)); + } +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_query_qp_mbox_out *outb; + struct mlx5_qp_context *context; + int mlx5_state; + int err = 0; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * Wait for any outstanding page faults, in case the user frees memory + * based upon this query's result. + */ + flush_workqueue(mlx5_ib_page_fault_wq); +#endif + + mutex_lock(&qp->mutex); + outb = kzalloc(sizeof(*outb), GFP_KERNEL); + if (!outb) { + err = -ENOMEM; + goto out; + } + context = &outb->ctx; + err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); + if (err) + goto out_free; + + mlx5_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mlx5_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = context->alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = context->pri_path.pkey_index & 0x7f; + qp_attr->port_num = context->pri_path.port; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto_lt >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; + qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. + */ + qp_attr->cap.max_inline_data = 0; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out_free: + kfree(outb); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_general_caps *gen; + struct mlx5_ib_xrcd *xrcd; + int err; + + gen = &dev->mdev->caps.gen; + if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); + if (err) { + kfree(xrcd); + return ERR_PTR(-ENOMEM); + } + + return &xrcd->ibxrcd; +} + +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct mlx5_ib_dev *dev = to_mdev(xrcd->device); + u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + int err; + + err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); + if (err) { + mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); + return err; + } + + kfree(xrcd); + + return 0; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/srq.c b/kernel/drivers/infiniband/hw/mlx5/srq.c new file mode 100644 index 000000000..02d77a297 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/srq.c @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/mlx5/qp.h> +#include <linux/mlx5/srq.h> +#include <linux/slab.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> + +#include "mlx5_ib.h" +#include "user.h" + +/* not supported currently */ +static int srq_signature; + +static void *get_wqe(struct mlx5_ib_srq *srq, int n) +{ + return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n", + type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, + struct ib_udata *udata, int buf_size, int *inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_srq ucmd; + size_t ucmdlen; + int err; + int npages; + int page_shift; + int ncont; + u32 offset; + + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_dbg(dev, "failed copy udata\n"); + return -EFAULT; + } + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, + 0, 0); + if (IS_ERR(srq->umem)) { + mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); + err = PTR_ERR(srq->umem); + return err; + } + + mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, &npages, + &page_shift, &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, + &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!(*in)) { + err = -ENOMEM; + goto err_umem; + } + + mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0); + + err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) { + mlx5_ib_dbg(dev, "map doorbell failed\n"); + goto err_in; + } + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); + + return 0; + +err_in: + kvfree(*in); + +err_umem: + ib_umem_release(srq->umem); + + return err; +} + +static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, int buf_size, + int *inlen) +{ + int err; + int i; + struct mlx5_wqe_srq_next_seg *next; + int page_shift; + int npages; + + err = mlx5_db_alloc(dev->mdev, &srq->db); + if (err) { + mlx5_ib_warn(dev, "alloc dbell rec failed\n"); + return err; + } + + if (mlx5_buf_alloc(dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + mlx5_ib_dbg(dev, "buf alloc failed\n"); + err = -ENOMEM; + goto err_db; + } + page_shift = srq->buf.page_shift; + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; i++) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + } + + npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT)); + mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n", + buf_size, page_shift, srq->buf.npages, npages); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&srq->buf, (*in)->pas); + + srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + mlx5_ib_dbg(dev, "kmalloc failed %lu\n", + (unsigned long)(srq->msrq.max * sizeof(u64))); + err = -ENOMEM; + goto err_in; + } + srq->wq_sig = !!srq_signature; + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + return 0; + +err_in: + kvfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &srq->buf); + +err_db: + mlx5_db_free(dev->mdev, &srq->db); + return err; +} + +static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq) +{ + mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + ib_umem_release(srq->umem); +} + + +static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq) +{ + kfree(srq->wrid); + mlx5_buf_free(dev->mdev, &srq->buf); + mlx5_db_free(dev->mdev, &srq->db); +} + +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_general_caps *gen; + struct mlx5_ib_srq *srq; + int desc_size; + int buf_size; + int err; + struct mlx5_create_srq_mbox_in *uninitialized_var(in); + int uninitialized_var(inlen); + int is_xrc; + u32 flgs, xrcdn; + + gen = &dev->mdev->caps.gen; + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= gen->max_srq_wqes) { + mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", + init_attr->attr.max_wr, + gen->max_srq_wqes); + return ERR_PTR(-EINVAL); + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + desc_size = roundup_pow_of_two(desc_size); + desc_size = max_t(int, 32, desc_size); + srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) / + sizeof(struct mlx5_wqe_data_seg); + srq->msrq.wqe_shift = ilog2(desc_size); + buf_size = srq->msrq.max * desc_size; + mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n", + desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, + srq->msrq.max_avail_gather); + + if (pd->uobject) + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + else + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + + if (err) { + mlx5_ib_warn(dev, "create srq %s failed, err %d\n", + pd->uobject ? "user" : "kernel", err); + goto err_srq; + } + + is_xrc = (init_attr->srq_type == IB_SRQT_XRC); + in->ctx.state_log_sz = ilog2(srq->msrq.max); + flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; + xrcdn = 0; + if (is_xrc) { + xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn); + } else if (init_attr->srq_type == IB_SRQT_BASIC) { + xrcdn = to_mxrcd(dev->devr.x0)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn); + } + + in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF)); + + in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn); + in->ctx.db_record = cpu_to_be64(srq->db.dma); + err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen); + kvfree(in); + if (err) { + mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); + goto err_usr_kern_srq; + } + + mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); + + srq->msrq.event = mlx5_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) { + mlx5_ib_dbg(dev, "copy to user failed\n"); + err = -EFAULT; + goto err_core; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_core: + mlx5_core_destroy_srq(dev->mdev, &srq->msrq); + +err_usr_kern_srq: + if (pd->uobject) + destroy_srq_user(pd, srq); + else + destroy_srq_kernel(dev, srq); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs yet */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + struct mlx5_query_srq_mbox_out *out; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out); + if (ret) + goto out_box; + + srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm); + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + +out_box: + kfree(out); + return ret; +} + +int mlx5_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx5_ib_dev *dev = to_mdev(srq->device); + struct mlx5_ib_srq *msrq = to_msrq(srq); + + mlx5_core_destroy_srq(dev->mdev, &msrq->msrq); + + if (srq->uobject) { + mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + destroy_srq_kernel(dev, msrq); + } + + kfree(srq); + return 0; +} + +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index) +{ + struct mlx5_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. */ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx5_wqe_data_seg *)(next + 1); + + for (i = 0; i < wr->num_sge; i++) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_avail_gather) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } + + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/user.h b/kernel/drivers/infiniband/hw/mlx5/user.h new file mode 100644 index 000000000..76fb7b927 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/user.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_USER_H +#define MLX5_IB_USER_H + +#include <linux/types.h> + +enum { + MLX5_QP_FLAG_SIGNATURE = 1 << 0, + MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, +}; + +enum { + MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, +}; + + +/* Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MLX5_IB_UVERBS_ABI_VERSION 1 + +/* Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx5_ib_alloc_ucontext_req { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; +}; + +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 reserved; +}; + +struct mlx5_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_uuars; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 reserved; +}; + +struct mlx5_ib_alloc_pd_resp { + __u32 pdn; +}; + +struct mlx5_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; + __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +struct mlx5_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx5_ib_resize_cq { + __u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; + __u32 flags; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +struct mlx5_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx5_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; +}; + +struct mlx5_ib_create_qp_resp { + __u32 uuar_index; +}; +#endif /* MLX5_IB_USER_H */ |