summaryrefslogtreecommitdiffstats
path: root/kernel/drivers/infiniband/core
diff options
context:
space:
mode:
authorJosé Pekkarinen <jose.pekkarinen@nokia.com>2016-04-11 10:41:07 +0300
committerJosé Pekkarinen <jose.pekkarinen@nokia.com>2016-04-13 08:17:18 +0300
commite09b41010ba33a20a87472ee821fa407a5b8da36 (patch)
treed10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/drivers/infiniband/core
parentf93b97fd65072de626c074dbe099a1fff05ce060 (diff)
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and rt patch from the rt wiki download page. During the rebasing, the following patch collided: Force tick interrupt and get rid of softirq magic(I70131fb85). Collisions have been removed because its logic was found on the source already. Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769 Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/drivers/infiniband/core')
-rw-r--r--kernel/drivers/infiniband/core/Makefile3
-rw-r--r--kernel/drivers/infiniband/core/addr.c22
-rw-r--r--kernel/drivers/infiniband/core/agent.c29
-rw-r--r--kernel/drivers/infiniband/core/agent.h6
-rw-r--r--kernel/drivers/infiniband/core/cache.c898
-rw-r--r--kernel/drivers/infiniband/core/cm.c352
-rw-r--r--kernel/drivers/infiniband/core/cma.c1088
-rw-r--r--kernel/drivers/infiniband/core/core_priv.h53
-rw-r--r--kernel/drivers/infiniband/core/device.c416
-rw-r--r--kernel/drivers/infiniband/core/iwpm_msg.c33
-rw-r--r--kernel/drivers/infiniband/core/iwpm_util.c12
-rw-r--r--kernel/drivers/infiniband/core/iwpm_util.h28
-rw-r--r--kernel/drivers/infiniband/core/mad.c723
-rw-r--r--kernel/drivers/infiniband/core/mad_priv.h18
-rw-r--r--kernel/drivers/infiniband/core/mad_rmpp.c33
-rw-r--r--kernel/drivers/infiniband/core/multicast.c30
-rw-r--r--kernel/drivers/infiniband/core/netlink.c55
-rw-r--r--kernel/drivers/infiniband/core/opa_smi.h78
-rw-r--r--kernel/drivers/infiniband/core/roce_gid_mgmt.c747
-rw-r--r--kernel/drivers/infiniband/core/sa_query.c579
-rw-r--r--kernel/drivers/infiniband/core/smi.c245
-rw-r--r--kernel/drivers/infiniband/core/smi.h4
-rw-r--r--kernel/drivers/infiniband/core/sysfs.c61
-rw-r--r--kernel/drivers/infiniband/core/ucm.c16
-rw-r--r--kernel/drivers/infiniband/core/ucma.c186
-rw-r--r--kernel/drivers/infiniband/core/user_mad.c70
-rw-r--r--kernel/drivers/infiniband/core/uverbs.h15
-rw-r--r--kernel/drivers/infiniband/core/uverbs_cmd.c736
-rw-r--r--kernel/drivers/infiniband/core/uverbs_main.c407
-rw-r--r--kernel/drivers/infiniband/core/uverbs_marshall.c4
-rw-r--r--kernel/drivers/infiniband/core/verbs.c573
31 files changed, 5709 insertions, 1811 deletions
diff --git a/kernel/drivers/infiniband/core/Makefile b/kernel/drivers/infiniband/core/Makefile
index acf736764..d43a8994a 100644
--- a/kernel/drivers/infiniband/core/Makefile
+++ b/kernel/drivers/infiniband/core/Makefile
@@ -9,7 +9,8 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
$(user_access-y)
ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
- device.o fmr_pool.o cache.o netlink.o
+ device.o fmr_pool.o cache.o netlink.o \
+ roce_gid_mgmt.o
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
diff --git a/kernel/drivers/infiniband/core/addr.c b/kernel/drivers/infiniband/core/addr.c
index 38339d220..34b1adad0 100644
--- a/kernel/drivers/infiniband/core/addr.c
+++ b/kernel/drivers/infiniband/core/addr.c
@@ -128,7 +128,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
int ret = -EADDRNOTAVAIL;
if (dev_addr->bound_dev_if) {
- dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
if (!dev)
return -ENODEV;
ret = rdma_copy_addr(dev_addr, dev, NULL);
@@ -138,7 +138,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
switch (addr->sa_family) {
case AF_INET:
- dev = ip_dev_find(&init_net,
+ dev = ip_dev_find(dev_addr->net,
((struct sockaddr_in *) addr)->sin_addr.s_addr);
if (!dev)
@@ -149,12 +149,11 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
*vlan_id = rdma_vlan_dev_vlan_id(dev);
dev_put(dev);
break;
-
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
rcu_read_lock();
- for_each_netdev_rcu(&init_net, dev) {
- if (ipv6_chk_addr(&init_net,
+ for_each_netdev_rcu(dev_addr->net, dev) {
+ if (ipv6_chk_addr(dev_addr->net,
&((struct sockaddr_in6 *) addr)->sin6_addr,
dev, 1)) {
ret = rdma_copy_addr(dev_addr, dev, NULL);
@@ -236,7 +235,7 @@ static int addr4_resolve(struct sockaddr_in *src_in,
fl4.daddr = dst_ip;
fl4.saddr = src_ip;
fl4.flowi4_oif = addr->bound_dev_if;
- rt = ip_route_output_key(&init_net, &fl4);
+ rt = ip_route_output_key(addr->net, &fl4);
if (IS_ERR(rt)) {
ret = PTR_ERR(rt);
goto out;
@@ -278,12 +277,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
fl6.saddr = src_in->sin6_addr;
fl6.flowi6_oif = addr->bound_dev_if;
- dst = ip6_route_output(&init_net, NULL, &fl6);
+ dst = ip6_route_output(addr->net, NULL, &fl6);
if ((ret = dst->error))
goto put;
if (ipv6_addr_any(&fl6.saddr)) {
- ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
+ ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev,
&fl6.daddr, 0, &fl6.saddr);
if (ret)
goto put;
@@ -457,8 +456,8 @@ static void resolve_cb(int status, struct sockaddr *src_addr,
complete(&((struct resolve_cb_context *)context)->comp);
}
-int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
- u16 *vlan_id)
+int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid,
+ u8 *dmac, u16 *vlan_id, int if_index)
{
int ret = 0;
struct rdma_dev_addr dev_addr;
@@ -476,6 +475,8 @@ int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
rdma_gid2ip(&dgid_addr._sockaddr, dgid);
memset(&dev_addr, 0, sizeof(dev_addr));
+ dev_addr.bound_dev_if = if_index;
+ dev_addr.net = &init_net;
ctx.addr = &dev_addr;
init_completion(&ctx.comp);
@@ -510,6 +511,7 @@ int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
rdma_gid2ip(&gid_addr._sockaddr, sgid);
memset(&dev_addr, 0, sizeof(dev_addr));
+ dev_addr.net = &init_net;
ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id);
if (ret)
return ret;
diff --git a/kernel/drivers/infiniband/core/agent.c b/kernel/drivers/infiniband/core/agent.c
index f6d29614c..4fa524dfb 100644
--- a/kernel/drivers/infiniband/core/agent.c
+++ b/kernel/drivers/infiniband/core/agent.c
@@ -54,7 +54,7 @@ static DEFINE_SPINLOCK(ib_agent_port_list_lock);
static LIST_HEAD(ib_agent_port_list);
static struct ib_agent_port_private *
-__ib_get_agent_port(struct ib_device *device, int port_num)
+__ib_get_agent_port(const struct ib_device *device, int port_num)
{
struct ib_agent_port_private *entry;
@@ -67,7 +67,7 @@ __ib_get_agent_port(struct ib_device *device, int port_num)
}
static struct ib_agent_port_private *
-ib_get_agent_port(struct ib_device *device, int port_num)
+ib_get_agent_port(const struct ib_device *device, int port_num)
{
struct ib_agent_port_private *entry;
unsigned long flags;
@@ -78,9 +78,9 @@ ib_get_agent_port(struct ib_device *device, int port_num)
return entry;
}
-void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
- struct ib_wc *wc, struct ib_device *device,
- int port_num, int qpn)
+void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh,
+ const struct ib_wc *wc, const struct ib_device *device,
+ int port_num, int qpn, size_t resp_mad_len, bool opa)
{
struct ib_agent_port_private *port_priv;
struct ib_mad_agent *agent;
@@ -88,7 +88,7 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
struct ib_ah *ah;
struct ib_mad_send_wr_private *mad_send_wr;
- if (device->node_type == RDMA_NODE_IB_SWITCH)
+ if (rdma_cap_ib_switch(device))
port_priv = ib_get_agent_port(device, 0);
else
port_priv = ib_get_agent_port(device, port_num);
@@ -106,22 +106,27 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
return;
}
+ if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION)
+ resp_mad_len = IB_MGMT_MAD_SIZE;
+
send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
- IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
- GFP_KERNEL);
+ IB_MGMT_MAD_HDR,
+ resp_mad_len - IB_MGMT_MAD_HDR,
+ GFP_KERNEL,
+ mad_hdr->base_version);
if (IS_ERR(send_buf)) {
dev_err(&device->dev, "ib_create_send_mad error\n");
goto err1;
}
- memcpy(send_buf->mad, mad, sizeof *mad);
+ memcpy(send_buf->mad, mad_hdr, resp_mad_len);
send_buf->ah = ah;
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ if (rdma_cap_ib_switch(device)) {
mad_send_wr = container_of(send_buf,
struct ib_mad_send_wr_private,
send_buf);
- mad_send_wr->send_wr.wr.ud.port_num = port_num;
+ mad_send_wr->send_wr.port_num = port_num;
}
if (ib_post_send_mad(send_buf, NULL)) {
@@ -156,7 +161,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num)
goto error1;
}
- if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) {
+ if (rdma_cap_ib_smi(device, port_num)) {
/* Obtain send only MAD agent for SMI QP */
port_priv->agent[0] = ib_register_mad_agent(device, port_num,
IB_QPT_SMI, NULL, 0,
diff --git a/kernel/drivers/infiniband/core/agent.h b/kernel/drivers/infiniband/core/agent.h
index 666928700..65f92beda 100644
--- a/kernel/drivers/infiniband/core/agent.h
+++ b/kernel/drivers/infiniband/core/agent.h
@@ -44,8 +44,8 @@ extern int ib_agent_port_open(struct ib_device *device, int port_num);
extern int ib_agent_port_close(struct ib_device *device, int port_num);
-extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
- struct ib_wc *wc, struct ib_device *device,
- int port_num, int qpn);
+extern void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh,
+ const struct ib_wc *wc, const struct ib_device *device,
+ int port_num, int qpn, size_t resp_mad_len, bool opa);
#endif /* __AGENT_H_ */
diff --git a/kernel/drivers/infiniband/core/cache.c b/kernel/drivers/infiniband/core/cache.c
index 80f6cf244..89bebeada 100644
--- a/kernel/drivers/infiniband/core/cache.c
+++ b/kernel/drivers/infiniband/core/cache.c
@@ -37,6 +37,8 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
#include <rdma/ib_cache.h>
@@ -47,90 +49,720 @@ struct ib_pkey_cache {
u16 table[0];
};
-struct ib_gid_cache {
- int table_len;
- union ib_gid table[0];
-};
-
struct ib_update_work {
struct work_struct work;
struct ib_device *device;
u8 port_num;
};
-static inline int start_port(struct ib_device *device)
+union ib_gid zgid;
+EXPORT_SYMBOL(zgid);
+
+static const struct ib_gid_attr zattr;
+
+enum gid_attr_find_mask {
+ GID_ATTR_FIND_MASK_GID = 1UL << 0,
+ GID_ATTR_FIND_MASK_NETDEV = 1UL << 1,
+ GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2,
+};
+
+enum gid_table_entry_props {
+ GID_TABLE_ENTRY_INVALID = 1UL << 0,
+ GID_TABLE_ENTRY_DEFAULT = 1UL << 1,
+};
+
+enum gid_table_write_action {
+ GID_TABLE_WRITE_ACTION_ADD,
+ GID_TABLE_WRITE_ACTION_DEL,
+ /* MODIFY only updates the GID table. Currently only used by
+ * ib_cache_update.
+ */
+ GID_TABLE_WRITE_ACTION_MODIFY
+};
+
+struct ib_gid_table_entry {
+ /* This lock protects an entry from being
+ * read and written simultaneously.
+ */
+ rwlock_t lock;
+ unsigned long props;
+ union ib_gid gid;
+ struct ib_gid_attr attr;
+ void *context;
+};
+
+struct ib_gid_table {
+ int sz;
+ /* In RoCE, adding a GID to the table requires:
+ * (a) Find if this GID is already exists.
+ * (b) Find a free space.
+ * (c) Write the new GID
+ *
+ * Delete requires different set of operations:
+ * (a) Find the GID
+ * (b) Delete it.
+ *
+ * Add/delete should be carried out atomically.
+ * This is done by locking this mutex from multiple
+ * writers. We don't need this lock for IB, as the MAD
+ * layer replaces all entries. All data_vec entries
+ * are locked by this lock.
+ **/
+ struct mutex lock;
+ struct ib_gid_table_entry *data_vec;
+};
+
+static int write_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ enum gid_table_write_action action,
+ bool default_gid)
+{
+ int ret = 0;
+ struct net_device *old_net_dev;
+ unsigned long flags;
+
+ /* in rdma_cap_roce_gid_table, this funciton should be protected by a
+ * sleep-able lock.
+ */
+ write_lock_irqsave(&table->data_vec[ix].lock, flags);
+
+ if (rdma_cap_roce_gid_table(ib_dev, port)) {
+ table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
+ write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+ /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
+ * RoCE providers and thus only updates the cache.
+ */
+ if (action == GID_TABLE_WRITE_ACTION_ADD)
+ ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr,
+ &table->data_vec[ix].context);
+ else if (action == GID_TABLE_WRITE_ACTION_DEL)
+ ret = ib_dev->del_gid(ib_dev, port, ix,
+ &table->data_vec[ix].context);
+ write_lock_irqsave(&table->data_vec[ix].lock, flags);
+ }
+
+ old_net_dev = table->data_vec[ix].attr.ndev;
+ if (old_net_dev && old_net_dev != attr->ndev)
+ dev_put(old_net_dev);
+ /* if modify_gid failed, just delete the old gid */
+ if (ret || action == GID_TABLE_WRITE_ACTION_DEL) {
+ gid = &zgid;
+ attr = &zattr;
+ table->data_vec[ix].context = NULL;
+ }
+ if (default_gid)
+ table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT;
+ memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid));
+ memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr));
+ if (table->data_vec[ix].attr.ndev &&
+ table->data_vec[ix].attr.ndev != old_net_dev)
+ dev_hold(table->data_vec[ix].attr.ndev);
+
+ table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
+
+ write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+
+ if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) {
+ struct ib_event event;
+
+ event.device = ib_dev;
+ event.element.port_num = port;
+ event.event = IB_EVENT_GID_CHANGE;
+
+ ib_dispatch_event(&event);
+ }
+ return ret;
+}
+
+static int add_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, gid, attr,
+ GID_TABLE_WRITE_ACTION_ADD, default_gid);
+}
+
+static int modify_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, gid, attr,
+ GID_TABLE_WRITE_ACTION_MODIFY, default_gid);
+}
+
+static int del_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, &zgid, &zattr,
+ GID_TABLE_WRITE_ACTION_DEL, default_gid);
+}
+
+static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
+ const struct ib_gid_attr *val, bool default_gid,
+ unsigned long mask)
{
- return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+ int i;
+
+ for (i = 0; i < table->sz; i++) {
+ unsigned long flags;
+ struct ib_gid_attr *attr = &table->data_vec[i].attr;
+
+ read_lock_irqsave(&table->data_vec[i].lock, flags);
+
+ if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
+ goto next;
+
+ if (mask & GID_ATTR_FIND_MASK_GID &&
+ memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+ goto next;
+
+ if (mask & GID_ATTR_FIND_MASK_NETDEV &&
+ attr->ndev != val->ndev)
+ goto next;
+
+ if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
+ !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) !=
+ default_gid)
+ goto next;
+
+ read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+ return i;
+next:
+ read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+ }
+
+ return -1;
}
-static inline int end_port(struct ib_device *device)
+static void make_default_gid(struct net_device *dev, union ib_gid *gid)
{
- return (device->node_type == RDMA_NODE_IB_SWITCH) ?
- 0 : device->phys_port_cnt;
+ gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ addrconf_ifid_eui48(&gid->raw[8], dev);
}
-int ib_get_cached_gid(struct ib_device *device,
- u8 port_num,
- int index,
- union ib_gid *gid)
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
{
- struct ib_gid_cache *cache;
- unsigned long flags;
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
int ret = 0;
+ struct net_device *idev;
- if (port_num < start_port(device) || port_num > end_port(device))
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ if (!memcmp(gid, &zgid, sizeof(*gid)))
return -EINVAL;
- read_lock_irqsave(&device->cache.lock, flags);
+ if (ib_dev->get_netdev) {
+ idev = ib_dev->get_netdev(ib_dev, port);
+ if (idev && attr->ndev != idev) {
+ union ib_gid default_gid;
- cache = device->cache.gid_cache[port_num - start_port(device)];
+ /* Adding default GIDs in not permitted */
+ make_default_gid(idev, &default_gid);
+ if (!memcmp(gid, &default_gid, sizeof(*gid))) {
+ dev_put(idev);
+ return -EPERM;
+ }
+ }
+ if (idev)
+ dev_put(idev);
+ }
- if (index < 0 || index >= cache->table_len)
- ret = -EINVAL;
- else
- *gid = cache->table[index];
+ mutex_lock(&table->lock);
- read_unlock_irqrestore(&device->cache.lock, flags);
+ ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_NETDEV);
+ if (ix >= 0)
+ goto out_unlock;
+
+ ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_DEFAULT);
+ if (ix < 0) {
+ ret = -ENOSPC;
+ goto out_unlock;
+ }
+ add_gid(ib_dev, port, table, ix, gid, attr, false);
+
+out_unlock:
+ mutex_unlock(&table->lock);
return ret;
}
-EXPORT_SYMBOL(ib_get_cached_gid);
-int ib_find_cached_gid(struct ib_device *device,
- union ib_gid *gid,
- u8 *port_num,
- u16 *index)
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ mutex_lock(&table->lock);
+
+ ix = find_gid(table, gid, attr, false,
+ GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_NETDEV |
+ GID_ATTR_FIND_MASK_DEFAULT);
+ if (ix < 0)
+ goto out_unlock;
+
+ del_gid(ib_dev, port, table, ix, false);
+
+out_unlock:
+ mutex_unlock(&table->lock);
+ return 0;
+}
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev)
{
- struct ib_gid_cache *cache;
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ mutex_lock(&table->lock);
+
+ for (ix = 0; ix < table->sz; ix++)
+ if (table->data_vec[ix].attr.ndev == ndev)
+ del_gid(ib_dev, port, table, ix, false);
+
+ mutex_unlock(&table->lock);
+ return 0;
+}
+
+static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
unsigned long flags;
- int p, i;
- int ret = -ENOENT;
- *port_num = -1;
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ if (index < 0 || index >= table->sz)
+ return -EINVAL;
+
+ read_lock_irqsave(&table->data_vec[index].lock, flags);
+ if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) {
+ read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+ return -EAGAIN;
+ }
+
+ memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
+ if (attr) {
+ memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
+ if (attr->ndev)
+ dev_hold(attr->ndev);
+ }
+
+ read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+ return 0;
+}
+
+static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *val,
+ unsigned long mask,
+ u8 *port, u16 *index)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ u8 p;
+ int local_index;
+
+ for (p = 0; p < ib_dev->phys_port_cnt; p++) {
+ table = ports_table[p];
+ local_index = find_gid(table, gid, val, false, mask);
+ if (local_index >= 0) {
+ if (index)
+ *index = local_index;
+ if (port)
+ *port = p + rdma_start_port(ib_dev);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static int ib_cache_gid_find(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ struct net_device *ndev, u8 *port,
+ u16 *index)
+{
+ unsigned long mask = GID_ATTR_FIND_MASK_GID;
+ struct ib_gid_attr gid_attr_val = {.ndev = ndev};
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
+ mask, port, index);
+}
+
+int ib_find_cached_gid_by_port(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ u8 port, struct net_device *ndev,
+ u16 *index)
+{
+ int local_index;
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ unsigned long mask = GID_ATTR_FIND_MASK_GID;
+ struct ib_gid_attr val = {.ndev = ndev};
+
+ if (port < rdma_start_port(ib_dev) ||
+ port > rdma_end_port(ib_dev))
+ return -ENOENT;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ local_index = find_gid(table, gid, &val, false, mask);
+ if (local_index >= 0) {
+ if (index)
+ *index = local_index;
+ return 0;
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_cached_gid_by_port);
+
+/**
+ * ib_find_gid_by_filter - Returns the GID table index where a specified
+ * GID value occurs
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value could be
+ * searched.
+ * @filter: The filter function is executed on any matching GID in the table.
+ * If the filter function returns true, the corresponding index is returned,
+ * otherwise, we continue searching the GID table. It's guaranteed that
+ * while filter is executed, ndev field is valid and the structure won't
+ * change. filter is executed in an atomic context. filter must not be NULL.
+ * @index: The index into the cached GID table where the GID was found. This
+ * parameter may be NULL.
+ *
+ * ib_cache_gid_find_by_filter() searches for the specified GID value
+ * of which the filter function returns true in the port's GID table.
+ * This function is only supported on RoCE ports.
+ *
+ */
+static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ u8 port,
+ bool (*filter)(const union ib_gid *,
+ const struct ib_gid_attr *,
+ void *),
+ void *context,
+ u16 *index)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ unsigned int i;
+ bool found = false;
+
+ if (!ports_table)
+ return -EOPNOTSUPP;
+
+ if (port < rdma_start_port(ib_dev) ||
+ port > rdma_end_port(ib_dev) ||
+ !rdma_protocol_roce(ib_dev, port))
+ return -EPROTONOSUPPORT;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ for (i = 0; i < table->sz; i++) {
+ struct ib_gid_attr attr;
+ unsigned long flags;
+
+ read_lock_irqsave(&table->data_vec[i].lock, flags);
+ if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
+ goto next;
+
+ if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+ goto next;
+
+ memcpy(&attr, &table->data_vec[i].attr, sizeof(attr));
+
+ if (filter(gid, &attr, context))
+ found = true;
+
+next:
+ read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+
+ if (found)
+ break;
+ }
+
+ if (!found)
+ return -ENOENT;
+
if (index)
- *index = -1;
+ *index = i;
+ return 0;
+}
- read_lock_irqsave(&device->cache.lock, flags);
+static struct ib_gid_table *alloc_gid_table(int sz)
+{
+ unsigned int i;
+ struct ib_gid_table *table =
+ kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
+ if (!table)
+ return NULL;
- for (p = 0; p <= end_port(device) - start_port(device); ++p) {
- cache = device->cache.gid_cache[p];
- for (i = 0; i < cache->table_len; ++i) {
- if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
- *port_num = p + start_port(device);
- if (index)
- *index = i;
- ret = 0;
- goto found;
- }
+ table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
+ if (!table->data_vec)
+ goto err_free_table;
+
+ mutex_init(&table->lock);
+
+ table->sz = sz;
+
+ for (i = 0; i < sz; i++)
+ rwlock_init(&table->data_vec[i].lock);
+
+ return table;
+
+err_free_table:
+ kfree(table);
+ return NULL;
+}
+
+static void release_gid_table(struct ib_gid_table *table)
+{
+ if (table) {
+ kfree(table->data_vec);
+ kfree(table);
+ }
+}
+
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table)
+{
+ int i;
+
+ if (!table)
+ return;
+
+ for (i = 0; i < table->sz; ++i) {
+ if (memcmp(&table->data_vec[i].gid, &zgid,
+ sizeof(table->data_vec[i].gid)))
+ del_gid(ib_dev, port, table, i,
+ table->data_vec[i].props &
+ GID_ATTR_FIND_MASK_DEFAULT);
+ }
+}
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev,
+ enum ib_cache_gid_default_mode mode)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+ struct ib_gid_table *table;
+ int ix;
+ union ib_gid current_gid;
+ struct ib_gid_attr current_gid_attr = {};
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ make_default_gid(ndev, &gid);
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ mutex_lock(&table->lock);
+ ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT);
+
+ /* Coudn't find default GID location */
+ WARN_ON(ix < 0);
+
+ if (!__ib_cache_gid_get(ib_dev, port, ix,
+ &current_gid, &current_gid_attr) &&
+ mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
+ !memcmp(&gid, &current_gid, sizeof(gid)) &&
+ !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
+ goto unlock;
+
+ if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
+ memcmp(&current_gid_attr, &zattr,
+ sizeof(current_gid_attr))) &&
+ del_gid(ib_dev, port, table, ix, true)) {
+ pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
+ ix, gid.raw);
+ goto unlock;
+ }
+
+ if (mode == IB_CACHE_GID_DEFAULT_MODE_SET)
+ if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
+ pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
+ gid.raw);
+
+unlock:
+ if (current_gid_attr.ndev)
+ dev_put(current_gid_attr.ndev);
+ mutex_unlock(&table->lock);
+}
+
+static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table)
+{
+ if (rdma_protocol_roce(ib_dev, port)) {
+ struct ib_gid_table_entry *entry = &table->data_vec[0];
+
+ entry->props |= GID_TABLE_ENTRY_DEFAULT;
+ }
+
+ return 0;
+}
+
+static int _gid_table_setup_one(struct ib_device *ib_dev)
+{
+ u8 port;
+ struct ib_gid_table **table;
+ int err = 0;
+
+ table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL);
+
+ if (!table) {
+ pr_warn("failed to allocate ib gid cache for %s\n",
+ ib_dev->name);
+ return -ENOMEM;
+ }
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ u8 rdma_port = port + rdma_start_port(ib_dev);
+
+ table[port] =
+ alloc_gid_table(
+ ib_dev->port_immutable[rdma_port].gid_tbl_len);
+ if (!table[port]) {
+ err = -ENOMEM;
+ goto rollback_table_setup;
}
+
+ err = gid_table_reserve_default(ib_dev,
+ port + rdma_start_port(ib_dev),
+ table[port]);
+ if (err)
+ goto rollback_table_setup;
}
-found:
- read_unlock_irqrestore(&device->cache.lock, flags);
- return ret;
+ ib_dev->cache.gid_cache = table;
+ return 0;
+
+rollback_table_setup:
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+ table[port]);
+ release_gid_table(table[port]);
+ }
+
+ kfree(table);
+ return err;
+}
+
+static void gid_table_release_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table **table = ib_dev->cache.gid_cache;
+ u8 port;
+
+ if (!table)
+ return;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++)
+ release_gid_table(table[port]);
+
+ kfree(table);
+ ib_dev->cache.gid_cache = NULL;
+}
+
+static void gid_table_cleanup_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table **table = ib_dev->cache.gid_cache;
+ u8 port;
+
+ if (!table)
+ return;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++)
+ cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+ table[port]);
+}
+
+static int gid_table_setup_one(struct ib_device *ib_dev)
+{
+ int err;
+
+ err = _gid_table_setup_one(ib_dev);
+
+ if (err)
+ return err;
+
+ err = roce_rescan_device(ib_dev);
+
+ if (err) {
+ gid_table_cleanup_one(ib_dev);
+ gid_table_release_one(ib_dev);
+ }
+
+ return err;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid,
+ struct ib_gid_attr *gid_attr)
+{
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+ return -EINVAL;
+
+ return __ib_cache_gid_get(device, port_num, index, gid, gid_attr);
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+ const union ib_gid *gid,
+ struct net_device *ndev,
+ u8 *port_num,
+ u16 *index)
+{
+ return ib_cache_gid_find(device, gid, ndev, port_num, index);
}
EXPORT_SYMBOL(ib_find_cached_gid);
+int ib_find_gid_by_filter(struct ib_device *device,
+ const union ib_gid *gid,
+ u8 port_num,
+ bool (*filter)(const union ib_gid *gid,
+ const struct ib_gid_attr *,
+ void *),
+ void *context, u16 *index)
+{
+ /* Only RoCE GID table supports filter function */
+ if (!rdma_cap_roce_gid_table(device, port_num) && filter)
+ return -EPROTONOSUPPORT;
+
+ return ib_cache_gid_find_by_filter(device, gid,
+ port_num, filter,
+ context, index);
+}
+EXPORT_SYMBOL(ib_find_gid_by_filter);
+
int ib_get_cached_pkey(struct ib_device *device,
u8 port_num,
int index,
@@ -140,12 +772,12 @@ int ib_get_cached_pkey(struct ib_device *device,
unsigned long flags;
int ret = 0;
- if (port_num < start_port(device) || port_num > end_port(device))
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.pkey_cache[port_num - start_port(device)];
+ cache = device->cache.pkey_cache[port_num - rdma_start_port(device)];
if (index < 0 || index >= cache->table_len)
ret = -EINVAL;
@@ -169,12 +801,12 @@ int ib_find_cached_pkey(struct ib_device *device,
int ret = -ENOENT;
int partial_ix = -1;
- if (port_num < start_port(device) || port_num > end_port(device))
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.pkey_cache[port_num - start_port(device)];
+ cache = device->cache.pkey_cache[port_num - rdma_start_port(device)];
*index = -1;
@@ -209,12 +841,12 @@ int ib_find_exact_cached_pkey(struct ib_device *device,
int i;
int ret = -ENOENT;
- if (port_num < start_port(device) || port_num > end_port(device))
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.pkey_cache[port_num - start_port(device)];
+ cache = device->cache.pkey_cache[port_num - rdma_start_port(device)];
*index = -1;
@@ -238,11 +870,11 @@ int ib_get_cached_lmc(struct ib_device *device,
unsigned long flags;
int ret = 0;
- if (port_num < start_port(device) || port_num > end_port(device))
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- *lmc = device->cache.lmc_cache[port_num - start_port(device)];
+ *lmc = device->cache.lmc_cache[port_num - rdma_start_port(device)];
read_unlock_irqrestore(&device->cache.lock, flags);
return ret;
@@ -254,9 +886,21 @@ static void ib_cache_update(struct ib_device *device,
{
struct ib_port_attr *tprops = NULL;
struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
- struct ib_gid_cache *gid_cache = NULL, *old_gid_cache;
+ struct ib_gid_cache {
+ int table_len;
+ union ib_gid table[0];
+ } *gid_cache = NULL;
int i;
int ret;
+ struct ib_gid_table *table;
+ struct ib_gid_table **ports_table = device->cache.gid_cache;
+ bool use_roce_gid_table =
+ rdma_cap_roce_gid_table(device, port);
+
+ if (port < rdma_start_port(device) || port > rdma_end_port(device))
+ return;
+
+ table = ports_table[port - rdma_start_port(device)];
tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
if (!tprops)
@@ -276,12 +920,14 @@ static void ib_cache_update(struct ib_device *device,
pkey_cache->table_len = tprops->pkey_tbl_len;
- gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
- sizeof *gid_cache->table, GFP_KERNEL);
- if (!gid_cache)
- goto err;
+ if (!use_roce_gid_table) {
+ gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len *
+ sizeof(*gid_cache->table), GFP_KERNEL);
+ if (!gid_cache)
+ goto err;
- gid_cache->table_len = tprops->gid_tbl_len;
+ gid_cache->table_len = tprops->gid_tbl_len;
+ }
for (i = 0; i < pkey_cache->table_len; ++i) {
ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
@@ -292,29 +938,36 @@ static void ib_cache_update(struct ib_device *device,
}
}
- for (i = 0; i < gid_cache->table_len; ++i) {
- ret = ib_query_gid(device, port, i, gid_cache->table + i);
- if (ret) {
- printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
- ret, device->name, i);
- goto err;
+ if (!use_roce_gid_table) {
+ for (i = 0; i < gid_cache->table_len; ++i) {
+ ret = ib_query_gid(device, port, i,
+ gid_cache->table + i, NULL);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
}
}
write_lock_irq(&device->cache.lock);
- old_pkey_cache = device->cache.pkey_cache[port - start_port(device)];
- old_gid_cache = device->cache.gid_cache [port - start_port(device)];
+ old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)];
- device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
- device->cache.gid_cache [port - start_port(device)] = gid_cache;
+ device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache;
+ if (!use_roce_gid_table) {
+ for (i = 0; i < gid_cache->table_len; i++) {
+ modify_gid(device, port, table, i, gid_cache->table + i,
+ &zattr, false);
+ }
+ }
- device->cache.lmc_cache[port - start_port(device)] = tprops->lmc;
+ device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;
write_unlock_irq(&device->cache.lock);
+ kfree(gid_cache);
kfree(old_pkey_cache);
- kfree(old_gid_cache);
kfree(tprops);
return;
@@ -355,85 +1008,88 @@ static void ib_cache_event(struct ib_event_handler *handler,
}
}
-static void ib_cache_setup_one(struct ib_device *device)
+int ib_cache_setup_one(struct ib_device *device)
{
int p;
+ int err;
rwlock_init(&device->cache.lock);
device->cache.pkey_cache =
- kmalloc(sizeof *device->cache.pkey_cache *
- (end_port(device) - start_port(device) + 1), GFP_KERNEL);
- device->cache.gid_cache =
- kmalloc(sizeof *device->cache.gid_cache *
- (end_port(device) - start_port(device) + 1), GFP_KERNEL);
-
+ kzalloc(sizeof *device->cache.pkey_cache *
+ (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
- (end_port(device) -
- start_port(device) + 1),
+ (rdma_end_port(device) -
+ rdma_start_port(device) + 1),
GFP_KERNEL);
-
- if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+ if (!device->cache.pkey_cache ||
!device->cache.lmc_cache) {
printk(KERN_WARNING "Couldn't allocate cache "
"for %s\n", device->name);
- goto err;
+ return -ENOMEM;
}
- for (p = 0; p <= end_port(device) - start_port(device); ++p) {
- device->cache.pkey_cache[p] = NULL;
- device->cache.gid_cache [p] = NULL;
- ib_cache_update(device, p + start_port(device));
- }
+ err = gid_table_setup_one(device);
+ if (err)
+ /* Allocated memory will be cleaned in the release function */
+ return err;
+
+ for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+ ib_cache_update(device, p + rdma_start_port(device));
INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
device, ib_cache_event);
- if (ib_register_event_handler(&device->cache.event_handler))
- goto err_cache;
-
- return;
+ err = ib_register_event_handler(&device->cache.event_handler);
+ if (err)
+ goto err;
-err_cache:
- for (p = 0; p <= end_port(device) - start_port(device); ++p) {
- kfree(device->cache.pkey_cache[p]);
- kfree(device->cache.gid_cache[p]);
- }
+ return 0;
err:
- kfree(device->cache.pkey_cache);
- kfree(device->cache.gid_cache);
- kfree(device->cache.lmc_cache);
+ gid_table_cleanup_one(device);
+ return err;
}
-static void ib_cache_cleanup_one(struct ib_device *device)
+void ib_cache_release_one(struct ib_device *device)
{
int p;
- ib_unregister_event_handler(&device->cache.event_handler);
- flush_workqueue(ib_wq);
-
- for (p = 0; p <= end_port(device) - start_port(device); ++p) {
- kfree(device->cache.pkey_cache[p]);
- kfree(device->cache.gid_cache[p]);
- }
-
+ /*
+ * The release function frees all the cache elements.
+ * This function should be called as part of freeing
+ * all the device's resources when the cache could no
+ * longer be accessed.
+ */
+ if (device->cache.pkey_cache)
+ for (p = 0;
+ p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+ kfree(device->cache.pkey_cache[p]);
+
+ gid_table_release_one(device);
kfree(device->cache.pkey_cache);
- kfree(device->cache.gid_cache);
kfree(device->cache.lmc_cache);
}
-static struct ib_client cache_client = {
- .name = "cache",
- .add = ib_cache_setup_one,
- .remove = ib_cache_cleanup_one
-};
+void ib_cache_cleanup_one(struct ib_device *device)
+{
+ /* The cleanup function unregisters the event handler,
+ * waits for all in-progress workqueue elements and cleans
+ * up the GID cache. This function should be called after
+ * the device was removed from the devices list and all
+ * clients were removed, so the cache exists but is
+ * non-functional and shouldn't be updated anymore.
+ */
+ ib_unregister_event_handler(&device->cache.event_handler);
+ flush_workqueue(ib_wq);
+ gid_table_cleanup_one(device);
+}
-int __init ib_cache_setup(void)
+void __init ib_cache_setup(void)
{
- return ib_register_client(&cache_client);
+ roce_gid_mgmt_init();
}
void __exit ib_cache_cleanup(void)
{
- ib_unregister_client(&cache_client);
+ roce_gid_mgmt_cleanup();
}
diff --git a/kernel/drivers/infiniband/core/cm.c b/kernel/drivers/infiniband/core/cm.c
index 0271608a5..d6d2b3582 100644
--- a/kernel/drivers/infiniband/core/cm.c
+++ b/kernel/drivers/infiniband/core/cm.c
@@ -58,7 +58,7 @@ MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
static void cm_add_one(struct ib_device *device);
-static void cm_remove_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device, void *client_data);
static struct ib_client cm_client = {
.name = "cm",
@@ -169,6 +169,7 @@ struct cm_device {
struct ib_device *ib_device;
struct device *device;
u8 ack_delay;
+ int going_down;
struct cm_port *port[0];
};
@@ -178,8 +179,6 @@ struct cm_av {
struct ib_ah_attr ah_attr;
u16 pkey_index;
u8 timeout;
- u8 valid;
- u8 smac[ETH_ALEN];
};
struct cm_work {
@@ -212,13 +211,15 @@ struct cm_id_private {
spinlock_t lock; /* Do not acquire inside cm.lock */
struct completion comp;
atomic_t refcount;
+ /* Number of clients sharing this ib_cm_id. Only valid for listeners.
+ * Protected by the cm.lock spinlock. */
+ int listen_sharecount;
struct ib_mad_send_buf *msg;
struct cm_timewait_info *timewait_info;
/* todo: use alternate port on send failure */
struct cm_av av;
struct cm_av alt_av;
- struct ib_cm_compare_data *compare_data;
void *private_data;
__be64 tid;
@@ -267,7 +268,8 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
cm_id_priv->av.pkey_index,
0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
- GFP_ATOMIC);
+ GFP_ATOMIC,
+ IB_MGMT_BASE_VERSION);
if (IS_ERR(m)) {
ib_destroy_ah(ah);
return PTR_ERR(m);
@@ -297,7 +299,8 @@ static int cm_alloc_response_msg(struct cm_port *port,
m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
- GFP_ATOMIC);
+ GFP_ATOMIC,
+ IB_MGMT_BASE_VERSION);
if (IS_ERR(m)) {
ib_destroy_ah(ah);
return PTR_ERR(m);
@@ -356,17 +359,21 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
unsigned long flags;
int ret;
u8 p;
+ struct net_device *ndev = ib_get_ndev_from_path(path);
read_lock_irqsave(&cm.device_lock, flags);
list_for_each_entry(cm_dev, &cm.device_list, list) {
if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
- &p, NULL)) {
+ ndev, &p, NULL)) {
port = cm_dev->port[p-1];
break;
}
}
read_unlock_irqrestore(&cm.device_lock, flags);
+ if (ndev)
+ dev_put(ndev);
+
if (!port)
return -EINVAL;
@@ -379,9 +386,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
&av->ah_attr);
av->timeout = path->packet_life_time + 1;
- memcpy(av->smac, path->smac, sizeof(av->smac));
- av->valid = 1;
return 0;
}
@@ -437,40 +442,6 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
return cm_id_priv;
}
-static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask)
-{
- int i;
-
- for (i = 0; i < IB_CM_COMPARE_SIZE; i++)
- dst[i] = src[i] & mask[i];
-}
-
-static int cm_compare_data(struct ib_cm_compare_data *src_data,
- struct ib_cm_compare_data *dst_data)
-{
- u32 src[IB_CM_COMPARE_SIZE];
- u32 dst[IB_CM_COMPARE_SIZE];
-
- if (!src_data || !dst_data)
- return 0;
-
- cm_mask_copy(src, src_data->data, dst_data->mask);
- cm_mask_copy(dst, dst_data->data, src_data->mask);
- return memcmp(src, dst, sizeof(src));
-}
-
-static int cm_compare_private_data(u32 *private_data,
- struct ib_cm_compare_data *dst_data)
-{
- u32 src[IB_CM_COMPARE_SIZE];
-
- if (!dst_data)
- return 0;
-
- cm_mask_copy(src, private_data, dst_data->mask);
- return memcmp(src, dst_data->data, sizeof(src));
-}
-
/*
* Trivial helpers to strip endian annotation and compare; the
* endianness doesn't actually matter since we just need a stable
@@ -503,18 +474,14 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
struct cm_id_private *cur_cm_id_priv;
__be64 service_id = cm_id_priv->id.service_id;
__be64 service_mask = cm_id_priv->id.service_mask;
- int data_cmp;
while (*link) {
parent = *link;
cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
service_node);
- data_cmp = cm_compare_data(cm_id_priv->compare_data,
- cur_cm_id_priv->compare_data);
if ((cur_cm_id_priv->id.service_mask & service_id) ==
(service_mask & cur_cm_id_priv->id.service_id) &&
- (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
- !data_cmp)
+ (cm_id_priv->id.device == cur_cm_id_priv->id.device))
return cur_cm_id_priv;
if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
@@ -525,8 +492,6 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
link = &(*link)->rb_left;
else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
link = &(*link)->rb_right;
- else if (data_cmp < 0)
- link = &(*link)->rb_left;
else
link = &(*link)->rb_right;
}
@@ -536,20 +501,16 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
}
static struct cm_id_private * cm_find_listen(struct ib_device *device,
- __be64 service_id,
- u32 *private_data)
+ __be64 service_id)
{
struct rb_node *node = cm.listen_service_table.rb_node;
struct cm_id_private *cm_id_priv;
- int data_cmp;
while (node) {
cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
- data_cmp = cm_compare_private_data(private_data,
- cm_id_priv->compare_data);
if ((cm_id_priv->id.service_mask & service_id) ==
cm_id_priv->id.service_id &&
- (cm_id_priv->id.device == device) && !data_cmp)
+ (cm_id_priv->id.device == device))
return cm_id_priv;
if (device < cm_id_priv->id.device)
@@ -560,8 +521,6 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
node = node->rb_left;
else if (be64_gt(service_id, cm_id_priv->id.service_id))
node = node->rb_right;
- else if (data_cmp < 0)
- node = node->rb_left;
else
node = node->rb_right;
}
@@ -803,6 +762,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
{
int wait_time;
unsigned long flags;
+ struct cm_device *cm_dev;
+
+ cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client);
+ if (!cm_dev)
+ return;
spin_lock_irqsave(&cm.lock, flags);
cm_cleanup_timewait(cm_id_priv->timewait_info);
@@ -816,8 +780,14 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
*/
cm_id_priv->id.state = IB_CM_TIMEWAIT;
wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
- queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
- msecs_to_jiffies(wait_time));
+
+ /* Check if the device started its remove_one */
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!cm_dev->going_down)
+ queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
+ msecs_to_jiffies(wait_time));
+ spin_unlock_irqrestore(&cm.lock, flags);
+
cm_id_priv->timewait_info = NULL;
}
@@ -845,9 +815,15 @@ retest:
spin_lock_irq(&cm_id_priv->lock);
switch (cm_id->state) {
case IB_CM_LISTEN:
- cm_id->state = IB_CM_IDLE;
spin_unlock_irq(&cm_id_priv->lock);
+
spin_lock_irq(&cm.lock);
+ if (--cm_id_priv->listen_sharecount > 0) {
+ /* The id is still shared. */
+ cm_deref_id(cm_id_priv);
+ spin_unlock_irq(&cm.lock);
+ return;
+ }
rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
spin_unlock_irq(&cm.lock);
break;
@@ -859,6 +835,11 @@ retest:
case IB_CM_SIDR_REQ_RCVD:
spin_unlock_irq(&cm_id_priv->lock);
cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+ spin_lock_irq(&cm.lock);
+ if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
+ rb_erase(&cm_id_priv->sidr_id_node,
+ &cm.remote_sidr_table);
+ spin_unlock_irq(&cm.lock);
break;
case IB_CM_REQ_SENT:
case IB_CM_MRA_REQ_RCVD:
@@ -916,7 +897,6 @@ retest:
wait_for_completion(&cm_id_priv->comp);
while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
cm_free_work(work);
- kfree(cm_id_priv->compare_data);
kfree(cm_id_priv->private_data);
kfree(cm_id_priv);
}
@@ -927,11 +907,23 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
}
EXPORT_SYMBOL(ib_destroy_cm_id);
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
- struct ib_cm_compare_data *compare_data)
+/**
+ * __ib_cm_listen - Initiates listening on the specified service ID for
+ * connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ * range of service IDs. If set to 0, the service ID is matched
+ * exactly. This parameter is ignored if %service_id is set to
+ * IB_CM_ASSIGN_SERVICE_ID.
+ */
+static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+ __be64 service_mask)
{
struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
- unsigned long flags;
int ret = 0;
service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
@@ -944,20 +936,9 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
if (cm_id->state != IB_CM_IDLE)
return -EINVAL;
- if (compare_data) {
- cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
- GFP_KERNEL);
- if (!cm_id_priv->compare_data)
- return -ENOMEM;
- cm_mask_copy(cm_id_priv->compare_data->data,
- compare_data->data, compare_data->mask);
- memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
- sizeof(compare_data->mask));
- }
-
cm_id->state = IB_CM_LISTEN;
+ ++cm_id_priv->listen_sharecount;
- spin_lock_irqsave(&cm.lock, flags);
if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
cm_id->service_mask = ~cpu_to_be64(0);
@@ -966,18 +947,95 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
cm_id->service_mask = service_mask;
}
cur_cm_id_priv = cm_insert_listen(cm_id_priv);
- spin_unlock_irqrestore(&cm.lock, flags);
if (cur_cm_id_priv) {
cm_id->state = IB_CM_IDLE;
- kfree(cm_id_priv->compare_data);
- cm_id_priv->compare_data = NULL;
+ --cm_id_priv->listen_sharecount;
ret = -EBUSY;
}
return ret;
}
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ ret = __ib_cm_listen(cm_id, service_id, service_mask);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ return ret;
+}
EXPORT_SYMBOL(ib_cm_listen);
+/**
+ * Create a new listening ib_cm_id and listen on the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id. All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ *
+ * Callers should call ib_destroy_cm_id when done with the listener ID.
+ */
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ __be64 service_id)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_cm_id *cm_id;
+ unsigned long flags;
+ int err = 0;
+
+ /* Create an ID in advance, since the creation may sleep */
+ cm_id = ib_create_cm_id(device, cm_handler, NULL);
+ if (IS_ERR(cm_id))
+ return cm_id;
+
+ spin_lock_irqsave(&cm.lock, flags);
+
+ if (service_id == IB_CM_ASSIGN_SERVICE_ID)
+ goto new_id;
+
+ /* Find an existing ID */
+ cm_id_priv = cm_find_listen(device, service_id);
+ if (cm_id_priv) {
+ if (cm_id->cm_handler != cm_handler || cm_id->context) {
+ /* Sharing an ib_cm_id with different handlers is not
+ * supported */
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return ERR_PTR(-EINVAL);
+ }
+ atomic_inc(&cm_id_priv->refcount);
+ ++cm_id_priv->listen_sharecount;
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ ib_destroy_cm_id(cm_id);
+ cm_id = &cm_id_priv->id;
+ return cm_id;
+ }
+
+new_id:
+ /* Use newly created ID */
+ err = __ib_cm_listen(cm_id, service_id, 0);
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ if (err) {
+ ib_destroy_cm_id(cm_id);
+ return ERR_PTR(err);
+ }
+ return cm_id;
+}
+EXPORT_SYMBOL(ib_cm_insert_listen);
+
static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
enum cm_msg_sequence msg_seq)
{
@@ -1254,6 +1312,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
primary_path->packet_life_time =
cm_req_get_primary_local_ack_timeout(req_msg);
primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+ primary_path->service_id = req_msg->service_id;
if (req_msg->alt_local_lid) {
memset(alt_path, 0, sizeof *alt_path);
@@ -1275,7 +1334,26 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
alt_path->packet_life_time =
cm_req_get_alt_local_ack_timeout(req_msg);
alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+ alt_path->service_id = req_msg->service_id;
+ }
+}
+
+static u16 cm_get_bth_pkey(struct cm_work *work)
+{
+ struct ib_device *ib_dev = work->port->cm_dev->ib_device;
+ u8 port_num = work->port->port_num;
+ u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
+ u16 pkey;
+ int ret;
+
+ ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
+ if (ret) {
+ dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n",
+ port_num, pkey_index, ret);
+ return 0;
}
+
+ return pkey;
}
static void cm_format_req_event(struct cm_work *work,
@@ -1288,6 +1366,7 @@ static void cm_format_req_event(struct cm_work *work,
req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
param = &work->cm_event.param.req_rcvd;
param->listen_id = listen_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
param->port = cm_id_priv->av.port->port_num;
param->primary_path = &work->path[0];
if (req_msg->alt_local_lid)
@@ -1470,8 +1549,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
/* Find matching listen request. */
listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
- req_msg->service_id,
- req_msg->private_data);
+ req_msg->service_id);
if (!listen_cm_id_priv) {
cm_cleanup_timewait(cm_id_priv->timewait_info);
spin_unlock_irq(&cm.lock);
@@ -1561,11 +1639,11 @@ static int cm_req_handler(struct cm_work *work)
cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN);
- work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id;
ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
if (ret) {
ib_get_cached_gid(work->port->cm_dev->ib_device,
- work->port->port_num, 0, &work->path[0].sgid);
+ work->port->port_num, 0, &work->path[0].sgid,
+ NULL);
ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
&work->path[0].sgid, sizeof work->path[0].sgid,
NULL, 0);
@@ -2978,6 +3056,8 @@ static void cm_format_sidr_req_event(struct cm_work *work,
param = &work->cm_event.param.sidr_req_rcvd;
param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
param->listen_id = listen_id;
+ param->service_id = sidr_req_msg->service_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
param->port = work->port->port_num;
work->cm_event.private_data = &sidr_req_msg->private_data;
}
@@ -3017,8 +3097,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
}
cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
cur_cm_id_priv = cm_find_listen(cm_id->device,
- sidr_req_msg->service_id,
- sidr_req_msg->private_data);
+ sidr_req_msg->service_id);
if (!cur_cm_id_priv) {
spin_unlock_irq(&cm.lock);
cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
@@ -3098,7 +3177,10 @@ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
spin_lock_irqsave(&cm.lock, flags);
- rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) {
+ rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
+ }
spin_unlock_irqrestore(&cm.lock, flags);
return 0;
@@ -3303,6 +3385,11 @@ static int cm_establish(struct ib_cm_id *cm_id)
struct cm_work *work;
unsigned long flags;
int ret = 0;
+ struct cm_device *cm_dev;
+
+ cm_dev = ib_get_client_data(cm_id->device, &cm_client);
+ if (!cm_dev)
+ return -ENODEV;
work = kmalloc(sizeof *work, GFP_ATOMIC);
if (!work)
@@ -3341,7 +3428,17 @@ static int cm_establish(struct ib_cm_id *cm_id)
work->remote_id = cm_id->remote_id;
work->mad_recv_wc = NULL;
work->cm_event.event = IB_CM_USER_ESTABLISHED;
- queue_delayed_work(cm.wq, &work->work, 0);
+
+ /* Check if the device started its remove_one */
+ spin_lock_irq(&cm.lock);
+ if (!cm_dev->going_down) {
+ queue_delayed_work(cm.wq, &work->work, 0);
+ } else {
+ kfree(work);
+ ret = -ENODEV;
+ }
+ spin_unlock_irq(&cm.lock);
+
out:
return ret;
}
@@ -3392,6 +3489,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,
enum ib_cm_event_type event;
u16 attr_id;
int paths = 0;
+ int going_down = 0;
switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
case CM_REQ_ATTR_ID:
@@ -3450,7 +3548,19 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,
work->cm_event.event = event;
work->mad_recv_wc = mad_recv_wc;
work->port = port;
- queue_delayed_work(cm.wq, &work->work, 0);
+
+ /* Check if the device started its remove_one */
+ spin_lock_irq(&cm.lock);
+ if (!port->cm_dev->going_down)
+ queue_delayed_work(cm.wq, &work->work, 0);
+ else
+ going_down = 1;
+ spin_unlock_irq(&cm.lock);
+
+ if (going_down) {
+ kfree(work);
+ ib_free_recv_mad(mad_recv_wc);
+ }
}
static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
@@ -3508,32 +3618,6 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
*qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
IB_QP_DEST_QPN | IB_QP_RQ_PSN;
qp_attr->ah_attr = cm_id_priv->av.ah_attr;
- if (!cm_id_priv->av.valid) {
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- return -EINVAL;
- }
- if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) {
- qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id;
- *qp_attr_mask |= IB_QP_VID;
- }
- if (!is_zero_ether_addr(cm_id_priv->av.smac)) {
- memcpy(qp_attr->smac, cm_id_priv->av.smac,
- sizeof(qp_attr->smac));
- *qp_attr_mask |= IB_QP_SMAC;
- }
- if (cm_id_priv->alt_av.valid) {
- if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) {
- qp_attr->alt_vlan_id =
- cm_id_priv->alt_av.ah_attr.vlan_id;
- *qp_attr_mask |= IB_QP_ALT_VID;
- }
- if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) {
- memcpy(qp_attr->alt_smac,
- cm_id_priv->alt_av.smac,
- sizeof(qp_attr->alt_smac));
- *qp_attr_mask |= IB_QP_ALT_SMAC;
- }
- }
qp_attr->path_mtu = cm_id_priv->path_mtu;
qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
@@ -3759,11 +3843,9 @@ static void cm_add_one(struct ib_device *ib_device)
};
unsigned long flags;
int ret;
+ int count = 0;
u8 i;
- if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB)
- return;
-
cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
ib_device->phys_port_cnt, GFP_KERNEL);
if (!cm_dev)
@@ -3771,7 +3853,7 @@ static void cm_add_one(struct ib_device *ib_device)
cm_dev->ib_device = ib_device;
cm_get_ack_delay(cm_dev);
-
+ cm_dev->going_down = 0;
cm_dev->device = device_create(&cm_class, &ib_device->dev,
MKDEV(0, 0), NULL,
"%s", ib_device->name);
@@ -3782,6 +3864,9 @@ static void cm_add_one(struct ib_device *ib_device)
set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
port = kzalloc(sizeof *port, GFP_KERNEL);
if (!port)
goto error1;
@@ -3808,7 +3893,13 @@ static void cm_add_one(struct ib_device *ib_device)
ret = ib_modify_port(ib_device, i, 0, &port_modify);
if (ret)
goto error3;
+
+ count++;
}
+
+ if (!count)
+ goto free;
+
ib_set_client_data(ib_device, &cm_client, cm_dev);
write_lock_irqsave(&cm.device_lock, flags);
@@ -3824,18 +3915,22 @@ error1:
port_modify.set_port_cap_mask = 0;
port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
while (--i) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
port = cm_dev->port[i-1];
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
ib_unregister_mad_agent(port->mad_agent);
cm_remove_port_fs(port);
}
+free:
device_unregister(cm_dev->device);
kfree(cm_dev);
}
-static void cm_remove_one(struct ib_device *ib_device)
+static void cm_remove_one(struct ib_device *ib_device, void *client_data)
{
- struct cm_device *cm_dev;
+ struct cm_device *cm_dev = client_data;
struct cm_port *port;
struct ib_port_modify port_modify = {
.clr_port_cap_mask = IB_PORT_CM_SUP
@@ -3843,7 +3938,6 @@ static void cm_remove_one(struct ib_device *ib_device)
unsigned long flags;
int i;
- cm_dev = ib_get_client_data(ib_device, &cm_client);
if (!cm_dev)
return;
@@ -3851,11 +3945,23 @@ static void cm_remove_one(struct ib_device *ib_device)
list_del(&cm_dev->list);
write_unlock_irqrestore(&cm.device_lock, flags);
+ spin_lock_irq(&cm.lock);
+ cm_dev->going_down = 1;
+ spin_unlock_irq(&cm.lock);
+
for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
port = cm_dev->port[i-1];
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
- ib_unregister_mad_agent(port->mad_agent);
+ /*
+ * We flush the queue here after the going_down set, this
+ * verify that no new works will be queued in the recv handler,
+ * after that we can call the unregister_mad_agent
+ */
flush_workqueue(cm.wq);
+ ib_unregister_mad_agent(port->mad_agent);
cm_remove_port_fs(port);
}
device_unregister(cm_dev->device);
diff --git a/kernel/drivers/infiniband/core/cma.c b/kernel/drivers/infiniband/core/cma.c
index 38ffe0981..17a15c560 100644
--- a/kernel/drivers/infiniband/core/cma.c
+++ b/kernel/drivers/infiniband/core/cma.c
@@ -44,8 +44,12 @@
#include <linux/module.h>
#include <net/route.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/tcp.h>
#include <net/ipv6.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
#include <rdma/rdma_cm.h>
#include <rdma/rdma_cm_ib.h>
@@ -65,8 +69,36 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
#define CMA_IBOE_PACKET_LIFETIME 18
+static const char * const cma_events[] = {
+ [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved",
+ [RDMA_CM_EVENT_ADDR_ERROR] = "address error",
+ [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ",
+ [RDMA_CM_EVENT_ROUTE_ERROR] = "route error",
+ [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request",
+ [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response",
+ [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error",
+ [RDMA_CM_EVENT_UNREACHABLE] = "unreachable",
+ [RDMA_CM_EVENT_REJECTED] = "rejected",
+ [RDMA_CM_EVENT_ESTABLISHED] = "established",
+ [RDMA_CM_EVENT_DISCONNECTED] = "disconnected",
+ [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal",
+ [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join",
+ [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error",
+ [RDMA_CM_EVENT_ADDR_CHANGE] = "address change",
+ [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit",
+};
+
+const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
+{
+ size_t index = event;
+
+ return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ?
+ cma_events[index] : "unrecognized event";
+}
+EXPORT_SYMBOL(rdma_event_msg);
+
static void cma_add_one(struct ib_device *device);
-static void cma_remove_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device, void *client_data);
static struct ib_client cma_client = {
.name = "cma",
@@ -80,10 +112,37 @@ static LIST_HEAD(dev_list);
static LIST_HEAD(listen_any_list);
static DEFINE_MUTEX(lock);
static struct workqueue_struct *cma_wq;
-static DEFINE_IDR(tcp_ps);
-static DEFINE_IDR(udp_ps);
-static DEFINE_IDR(ipoib_ps);
-static DEFINE_IDR(ib_ps);
+static int cma_pernet_id;
+
+struct cma_pernet {
+ struct idr tcp_ps;
+ struct idr udp_ps;
+ struct idr ipoib_ps;
+ struct idr ib_ps;
+};
+
+static struct cma_pernet *cma_pernet(struct net *net)
+{
+ return net_generic(net, cma_pernet_id);
+}
+
+static struct idr *cma_pernet_idr(struct net *net, enum rdma_port_space ps)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ switch (ps) {
+ case RDMA_PS_TCP:
+ return &pernet->tcp_ps;
+ case RDMA_PS_UDP:
+ return &pernet->udp_ps;
+ case RDMA_PS_IPOIB:
+ return &pernet->ipoib_ps;
+ case RDMA_PS_IB:
+ return &pernet->ib_ps;
+ default:
+ return NULL;
+ }
+}
struct cma_device {
struct list_head list;
@@ -94,11 +153,34 @@ struct cma_device {
};
struct rdma_bind_list {
- struct idr *ps;
+ enum rdma_port_space ps;
struct hlist_head owners;
unsigned short port;
};
+static int cma_ps_alloc(struct net *net, enum rdma_port_space ps,
+ struct rdma_bind_list *bind_list, int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+}
+
+static struct rdma_bind_list *cma_ps_find(struct net *net,
+ enum rdma_port_space ps, int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ return idr_find(idr, snum);
+}
+
+static void cma_ps_remove(struct net *net, enum rdma_port_space ps, int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ idr_remove(idr, snum);
+}
+
enum {
CMA_OPTION_AFONLY,
};
@@ -197,6 +279,15 @@ struct cma_hdr {
#define CMA_VERSION 0x00
+struct cma_req_info {
+ struct ib_device *device;
+ int port;
+ union ib_gid local_gid;
+ __be64 service_id;
+ u16 pkey;
+ bool has_gid:1;
+};
+
static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
{
unsigned long flags;
@@ -234,7 +325,7 @@ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
return old;
}
-static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
{
return hdr->ip_version >> 4;
}
@@ -349,18 +440,40 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a
return ret;
}
+static inline int cma_validate_port(struct ib_device *device, u8 port,
+ union ib_gid *gid, int dev_type,
+ int bound_if_index)
+{
+ int ret = -ENODEV;
+ struct net_device *ndev = NULL;
+
+ if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))
+ return ret;
+
+ if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port))
+ return ret;
+
+ if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port))
+ ndev = dev_get_by_index(&init_net, bound_if_index);
+
+ ret = ib_find_cached_gid_by_port(device, gid, port, ndev, NULL);
+
+ if (ndev)
+ dev_put(ndev);
+
+ return ret;
+}
+
static int cma_acquire_dev(struct rdma_id_private *id_priv,
struct rdma_id_private *listen_id_priv)
{
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
struct cma_device *cma_dev;
- union ib_gid gid, iboe_gid;
+ union ib_gid gid, iboe_gid, *gidp;
int ret = -ENODEV;
- u8 port, found_port;
- enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ?
- IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+ u8 port;
- if (dev_ll != IB_LINK_LAYER_INFINIBAND &&
+ if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
id_priv->id.ps == RDMA_PS_IPOIB)
return -EINVAL;
@@ -370,41 +483,38 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
memcpy(&gid, dev_addr->src_dev_addr +
rdma_addr_gid_offset(dev_addr), sizeof gid);
- if (listen_id_priv &&
- rdma_port_get_link_layer(listen_id_priv->id.device,
- listen_id_priv->id.port_num) == dev_ll) {
+
+ if (listen_id_priv) {
cma_dev = listen_id_priv->cma_dev;
port = listen_id_priv->id.port_num;
- if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
- rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
- ret = ib_find_cached_gid(cma_dev->device, &iboe_gid,
- &found_port, NULL);
- else
- ret = ib_find_cached_gid(cma_dev->device, &gid,
- &found_port, NULL);
-
- if (!ret && (port == found_port)) {
- id_priv->id.port_num = found_port;
+ gidp = rdma_protocol_roce(cma_dev->device, port) ?
+ &iboe_gid : &gid;
+
+ ret = cma_validate_port(cma_dev->device, port, gidp,
+ dev_addr->dev_type,
+ dev_addr->bound_dev_if);
+ if (!ret) {
+ id_priv->id.port_num = port;
goto out;
}
}
+
list_for_each_entry(cma_dev, &dev_list, list) {
for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
if (listen_id_priv &&
listen_id_priv->cma_dev == cma_dev &&
listen_id_priv->id.port_num == port)
continue;
- if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) {
- if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
- rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
- ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL);
- else
- ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL);
-
- if (!ret && (port == found_port)) {
- id_priv->id.port_num = found_port;
- goto out;
- }
+
+ gidp = rdma_protocol_roce(cma_dev->device, port) ?
+ &iboe_gid : &gid;
+
+ ret = cma_validate_port(cma_dev->device, port, gidp,
+ dev_addr->dev_type,
+ dev_addr->bound_dev_if);
+ if (!ret) {
+ id_priv->id.port_num = port;
+ goto out;
}
}
}
@@ -435,14 +545,16 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
pkey = ntohs(addr->sib_pkey);
list_for_each_entry(cur_dev, &dev_list, list) {
- if (rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB)
- continue;
-
for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+ if (!rdma_cap_af_ib(cur_dev->device, p))
+ continue;
+
if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index))
continue;
- for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, &gid); i++) {
+ for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i,
+ &gid, NULL);
+ i++) {
if (!memcmp(&gid, dgid, sizeof(gid))) {
cma_dev = cur_dev;
sgid = gid;
@@ -488,7 +600,8 @@ static int cma_disable_callback(struct rdma_id_private *id_priv,
return 0;
}
-struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+struct rdma_cm_id *rdma_create_id(struct net *net,
+ rdma_cm_event_handler event_handler,
void *context, enum rdma_port_space ps,
enum ib_qp_type qp_type)
{
@@ -512,6 +625,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
INIT_LIST_HEAD(&id_priv->listen_list);
INIT_LIST_HEAD(&id_priv->mc_list);
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
+ id_priv->id.route.addr.dev_addr.net = get_net(net);
return &id_priv->id;
}
@@ -629,19 +743,12 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
goto out;
ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num,
- qp_attr.ah_attr.grh.sgid_index, &sgid);
+ qp_attr.ah_attr.grh.sgid_index, &sgid, NULL);
if (ret)
goto out;
- if (rdma_node_get_transport(id_priv->cma_dev->device->node_type)
- == RDMA_TRANSPORT_IB &&
- rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)
- == IB_LINK_LAYER_ETHERNET) {
- ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL);
+ BUG_ON(id_priv->cma_dev->device != id_priv->id.device);
- if (ret)
- goto out;
- }
if (conn_param)
qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
@@ -700,11 +807,10 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
int ret;
u16 pkey;
- if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) ==
- IB_LINK_LAYER_INFINIBAND)
- pkey = ib_addr_get_pkey(dev_addr);
- else
+ if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num))
pkey = 0xffff;
+ else
+ pkey = ib_addr_get_pkey(dev_addr);
ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
pkey, &qp_attr->pkey_index);
@@ -735,8 +841,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
int ret = 0;
id_priv = container_of(id, struct rdma_id_private, id);
- switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD))
ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
else
@@ -745,19 +850,15 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
if (qp_attr->qp_state == IB_QPS_RTR)
qp_attr->rq_psn = id_priv->seq_num;
- break;
- case RDMA_TRANSPORT_IWARP:
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
if (!id_priv->cm_id.iw) {
qp_attr->qp_access_flags = 0;
*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
} else
ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
qp_attr_mask);
- break;
- default:
+ } else
ret = -ENOSYS;
- break;
- }
return ret;
}
@@ -837,107 +938,419 @@ static inline int cma_any_port(struct sockaddr *addr)
return !cma_port(addr);
}
-static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+static void cma_save_ib_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct rdma_cm_id *listen_id,
struct ib_sa_path_rec *path)
{
struct sockaddr_ib *listen_ib, *ib;
listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
- ib = (struct sockaddr_ib *) &id->route.addr.src_addr;
- ib->sib_family = listen_ib->sib_family;
- if (path) {
- ib->sib_pkey = path->pkey;
- ib->sib_flowinfo = path->flow_label;
- memcpy(&ib->sib_addr, &path->sgid, 16);
- } else {
- ib->sib_pkey = listen_ib->sib_pkey;
- ib->sib_flowinfo = listen_ib->sib_flowinfo;
- ib->sib_addr = listen_ib->sib_addr;
+ if (src_addr) {
+ ib = (struct sockaddr_ib *)src_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->sgid, 16);
+ ib->sib_sid = path->service_id;
+ ib->sib_scope_id = 0;
+ } else {
+ ib->sib_pkey = listen_ib->sib_pkey;
+ ib->sib_flowinfo = listen_ib->sib_flowinfo;
+ ib->sib_addr = listen_ib->sib_addr;
+ ib->sib_sid = listen_ib->sib_sid;
+ ib->sib_scope_id = listen_ib->sib_scope_id;
+ }
+ ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
}
- ib->sib_sid = listen_ib->sib_sid;
- ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
- ib->sib_scope_id = listen_ib->sib_scope_id;
-
- if (path) {
- ib = (struct sockaddr_ib *) &id->route.addr.dst_addr;
- ib->sib_family = listen_ib->sib_family;
- ib->sib_pkey = path->pkey;
- ib->sib_flowinfo = path->flow_label;
- memcpy(&ib->sib_addr, &path->dgid, 16);
+ if (dst_addr) {
+ ib = (struct sockaddr_ib *)dst_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->dgid, 16);
+ }
}
}
-static __be16 ss_get_port(const struct sockaddr_storage *ss)
-{
- if (ss->ss_family == AF_INET)
- return ((struct sockaddr_in *)ss)->sin_port;
- else if (ss->ss_family == AF_INET6)
- return ((struct sockaddr_in6 *)ss)->sin6_port;
- BUG();
-}
-
-static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct cma_hdr *hdr)
+static void cma_save_ip4_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
{
struct sockaddr_in *ip4;
- ip4 = (struct sockaddr_in *) &id->route.addr.src_addr;
- ip4->sin_family = AF_INET;
- ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
- ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr);
+ if (src_addr) {
+ ip4 = (struct sockaddr_in *)src_addr;
+ ip4->sin_family = AF_INET;
+ ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
+ ip4->sin_port = local_port;
+ }
- ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr;
- ip4->sin_family = AF_INET;
- ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
- ip4->sin_port = hdr->port;
+ if (dst_addr) {
+ ip4 = (struct sockaddr_in *)dst_addr;
+ ip4->sin_family = AF_INET;
+ ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
+ ip4->sin_port = hdr->port;
+ }
}
-static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct cma_hdr *hdr)
+static void cma_save_ip6_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
{
struct sockaddr_in6 *ip6;
- ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr;
- ip6->sin6_family = AF_INET6;
- ip6->sin6_addr = hdr->dst_addr.ip6;
- ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr);
+ if (src_addr) {
+ ip6 = (struct sockaddr_in6 *)src_addr;
+ ip6->sin6_family = AF_INET6;
+ ip6->sin6_addr = hdr->dst_addr.ip6;
+ ip6->sin6_port = local_port;
+ }
- ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr;
- ip6->sin6_family = AF_INET6;
- ip6->sin6_addr = hdr->src_addr.ip6;
- ip6->sin6_port = hdr->port;
+ if (dst_addr) {
+ ip6 = (struct sockaddr_in6 *)dst_addr;
+ ip6->sin6_family = AF_INET6;
+ ip6->sin6_addr = hdr->src_addr.ip6;
+ ip6->sin6_port = hdr->port;
+ }
}
-static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+static u16 cma_port_from_service_id(__be64 service_id)
{
- struct cma_hdr *hdr;
+ return (u16)be64_to_cpu(service_id);
+}
- if (listen_id->route.addr.src_addr.ss_family == AF_IB) {
- if (ib_event->event == IB_CM_REQ_RECEIVED)
- cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path);
- else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
- cma_save_ib_info(id, listen_id, NULL);
- return 0;
- }
+static int cma_save_ip_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct ib_cm_event *ib_event,
+ __be64 service_id)
+{
+ struct cma_hdr *hdr;
+ __be16 port;
hdr = ib_event->private_data;
if (hdr->cma_version != CMA_VERSION)
return -EINVAL;
+ port = htons(cma_port_from_service_id(service_id));
+
switch (cma_get_ip_ver(hdr)) {
case 4:
- cma_save_ip4_info(id, listen_id, hdr);
+ cma_save_ip4_info(src_addr, dst_addr, hdr, port);
break;
case 6:
- cma_save_ip6_info(id, listen_id, hdr);
+ cma_save_ip6_info(src_addr, dst_addr, hdr, port);
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ return 0;
+}
+
+static int cma_save_net_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event,
+ sa_family_t sa_family, __be64 service_id)
+{
+ if (sa_family == AF_IB) {
+ if (ib_event->event == IB_CM_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id,
+ ib_event->param.req_rcvd.primary_path);
+ else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
+ return 0;
+ }
+
+ return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
+}
+
+static int cma_save_req_info(const struct ib_cm_event *ib_event,
+ struct cma_req_info *req)
+{
+ const struct ib_cm_req_event_param *req_param =
+ &ib_event->param.req_rcvd;
+ const struct ib_cm_sidr_req_event_param *sidr_param =
+ &ib_event->param.sidr_req_rcvd;
+
+ switch (ib_event->event) {
+ case IB_CM_REQ_RECEIVED:
+ req->device = req_param->listen_id->device;
+ req->port = req_param->port;
+ memcpy(&req->local_gid, &req_param->primary_path->sgid,
+ sizeof(req->local_gid));
+ req->has_gid = true;
+ req->service_id = req_param->primary_path->service_id;
+ req->pkey = be16_to_cpu(req_param->primary_path->pkey);
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ req->device = sidr_param->listen_id->device;
+ req->port = sidr_param->port;
+ req->has_gid = false;
+ req->service_id = sidr_param->service_id;
+ req->pkey = sidr_param->pkey;
break;
default:
return -EINVAL;
}
+
return 0;
}
+static bool validate_ipv4_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in *dst_addr,
+ const struct sockaddr_in *src_addr)
+{
+ __be32 daddr = dst_addr->sin_addr.s_addr,
+ saddr = src_addr->sin_addr.s_addr;
+ struct fib_result res;
+ struct flowi4 fl4;
+ int err;
+ bool ret;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+ ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) ||
+ ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) ||
+ ipv4_is_loopback(saddr))
+ return false;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_iif = net_dev->ifindex;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+
+ rcu_read_lock();
+ err = fib_lookup(dev_net(net_dev), &fl4, &res, 0);
+ ret = err == 0 && FIB_RES_DEV(res) == net_dev;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool validate_ipv6_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in6 *dst_addr,
+ const struct sockaddr_in6 *src_addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ const int strict = ipv6_addr_type(&dst_addr->sin6_addr) &
+ IPV6_ADDR_LINKLOCAL;
+ struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr,
+ &src_addr->sin6_addr, net_dev->ifindex,
+ strict);
+ bool ret;
+
+ if (!rt)
+ return false;
+
+ ret = rt->rt6i_idev->dev == net_dev;
+ ip6_rt_put(rt);
+
+ return ret;
+#else
+ return false;
+#endif
+}
+
+static bool validate_net_dev(struct net_device *net_dev,
+ const struct sockaddr *daddr,
+ const struct sockaddr *saddr)
+{
+ const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr;
+ const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr;
+ const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+ const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr;
+
+ switch (daddr->sa_family) {
+ case AF_INET:
+ return saddr->sa_family == AF_INET &&
+ validate_ipv4_net_dev(net_dev, daddr4, saddr4);
+
+ case AF_INET6:
+ return saddr->sa_family == AF_INET6 &&
+ validate_ipv6_net_dev(net_dev, daddr6, saddr6);
+
+ default:
+ return false;
+ }
+}
+
+static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event,
+ const struct cma_req_info *req)
+{
+ struct sockaddr_storage listen_addr_storage, src_addr_storage;
+ struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage,
+ *src_addr = (struct sockaddr *)&src_addr_storage;
+ struct net_device *net_dev;
+ const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
+ int err;
+
+ err = cma_save_ip_info(listen_addr, src_addr, ib_event,
+ req->service_id);
+ if (err)
+ return ERR_PTR(err);
+
+ net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey,
+ gid, listen_addr);
+ if (!net_dev)
+ return ERR_PTR(-ENODEV);
+
+ if (!validate_net_dev(net_dev, listen_addr, src_addr)) {
+ dev_put(net_dev);
+ return ERR_PTR(-EHOSTUNREACH);
+ }
+
+ return net_dev;
+}
+
+static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id)
+{
+ return (be64_to_cpu(service_id) >> 16) & 0xffff;
+}
+
+static bool cma_match_private_data(struct rdma_id_private *id_priv,
+ const struct cma_hdr *hdr)
+{
+ struct sockaddr *addr = cma_src_addr(id_priv);
+ __be32 ip4_addr;
+ struct in6_addr ip6_addr;
+
+ if (cma_any_addr(addr) && !id_priv->afonly)
+ return true;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
+ if (cma_get_ip_ver(hdr) != 4)
+ return false;
+ if (!cma_any_addr(addr) &&
+ hdr->dst_addr.ip4.addr != ip4_addr)
+ return false;
+ break;
+ case AF_INET6:
+ ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
+ if (cma_get_ip_ver(hdr) != 6)
+ return false;
+ if (!cma_any_addr(addr) &&
+ memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
+ return false;
+ break;
+ case AF_IB:
+ return true;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static bool cma_protocol_roce_dev_port(struct ib_device *device, int port_num)
+{
+ enum rdma_link_layer ll = rdma_port_get_link_layer(device, port_num);
+ enum rdma_transport_type transport =
+ rdma_node_get_transport(device->node_type);
+
+ return ll == IB_LINK_LAYER_ETHERNET && transport == RDMA_TRANSPORT_IB;
+}
+
+static bool cma_protocol_roce(const struct rdma_cm_id *id)
+{
+ struct ib_device *device = id->device;
+ const int port_num = id->port_num ?: rdma_start_port(device);
+
+ return cma_protocol_roce_dev_port(device, port_num);
+}
+
+static bool cma_match_net_dev(const struct rdma_cm_id *id,
+ const struct net_device *net_dev,
+ u8 port_num)
+{
+ const struct rdma_addr *addr = &id->route.addr;
+
+ if (!net_dev)
+ /* This request is an AF_IB request or a RoCE request */
+ return (!id->port_num || id->port_num == port_num) &&
+ (addr->src_addr.ss_family == AF_IB ||
+ cma_protocol_roce_dev_port(id->device, port_num));
+
+ return !addr->dev_addr.bound_dev_if ||
+ (net_eq(dev_net(net_dev), addr->dev_addr.net) &&
+ addr->dev_addr.bound_dev_if == net_dev->ifindex);
+}
+
+static struct rdma_id_private *cma_find_listener(
+ const struct rdma_bind_list *bind_list,
+ const struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event,
+ const struct cma_req_info *req,
+ const struct net_device *net_dev)
+{
+ struct rdma_id_private *id_priv, *id_priv_dev;
+
+ if (!bind_list)
+ return ERR_PTR(-EINVAL);
+
+ hlist_for_each_entry(id_priv, &bind_list->owners, node) {
+ if (cma_match_private_data(id_priv, ib_event->private_data)) {
+ if (id_priv->id.device == cm_id->device &&
+ cma_match_net_dev(&id_priv->id, net_dev, req->port))
+ return id_priv;
+ list_for_each_entry(id_priv_dev,
+ &id_priv->listen_list,
+ listen_list) {
+ if (id_priv_dev->id.device == cm_id->device &&
+ cma_match_net_dev(&id_priv_dev->id, net_dev, req->port))
+ return id_priv_dev;
+ }
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
+ struct ib_cm_event *ib_event,
+ struct net_device **net_dev)
+{
+ struct cma_req_info req;
+ struct rdma_bind_list *bind_list;
+ struct rdma_id_private *id_priv;
+ int err;
+
+ err = cma_save_req_info(ib_event, &req);
+ if (err)
+ return ERR_PTR(err);
+
+ *net_dev = cma_get_net_dev(ib_event, &req);
+ if (IS_ERR(*net_dev)) {
+ if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
+ /* Assuming the protocol is AF_IB */
+ *net_dev = NULL;
+ } else if (cma_protocol_roce_dev_port(req.device, req.port)) {
+ /* TODO find the net dev matching the request parameters
+ * through the RoCE GID table */
+ *net_dev = NULL;
+ } else {
+ return ERR_CAST(*net_dev);
+ }
+ }
+
+ bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net,
+ rdma_ps_from_service_id(req.service_id),
+ cma_port_from_service_id(req.service_id));
+ id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
+ if (IS_ERR(id_priv) && *net_dev) {
+ dev_put(*net_dev);
+ *net_dev = NULL;
+ }
+
+ return id_priv;
+}
+
static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
{
return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
@@ -945,13 +1358,9 @@ static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
static void cma_cancel_route(struct rdma_id_private *id_priv)
{
- switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
+ if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) {
if (id_priv->query)
ib_sa_cancel_query(id_priv->query_id, id_priv->query);
- break;
- default:
- break;
}
}
@@ -1002,6 +1411,7 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv,
static void cma_release_port(struct rdma_id_private *id_priv)
{
struct rdma_bind_list *bind_list = id_priv->bind_list;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
if (!bind_list)
return;
@@ -1009,7 +1419,7 @@ static void cma_release_port(struct rdma_id_private *id_priv)
mutex_lock(&lock);
hlist_del(&id_priv->node);
if (hlist_empty(&bind_list->owners)) {
- idr_remove(bind_list->ps, bind_list->port);
+ cma_ps_remove(net, bind_list->ps, bind_list->port);
kfree(bind_list);
}
mutex_unlock(&lock);
@@ -1023,17 +1433,12 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
mc = container_of(id_priv->mc_list.next,
struct cma_multicast, list);
list_del(&mc->list);
- switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
+ if (rdma_cap_ib_mcast(id_priv->cma_dev->device,
+ id_priv->id.port_num)) {
ib_sa_free_multicast(mc->multicast.ib);
kfree(mc);
- break;
- case IB_LINK_LAYER_ETHERNET:
+ } else
kref_put(&mc->mcref, release_mc);
- break;
- default:
- break;
- }
}
}
@@ -1054,17 +1459,12 @@ void rdma_destroy_id(struct rdma_cm_id *id)
mutex_unlock(&id_priv->handler_mutex);
if (id_priv->cma_dev) {
- switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
if (id_priv->cm_id.ib)
ib_destroy_cm_id(id_priv->cm_id.ib);
- break;
- case RDMA_TRANSPORT_IWARP:
+ } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) {
if (id_priv->cm_id.iw)
iw_destroy_cm_id(id_priv->cm_id.iw);
- break;
- default:
- break;
}
cma_leave_mc_groups(id_priv);
cma_release_dev(id_priv);
@@ -1078,6 +1478,7 @@ void rdma_destroy_id(struct rdma_cm_id *id)
cma_deref_id(id_priv->id.context);
kfree(id_priv->id.route.path_rec);
+ put_net(id_priv->id.route.addr.dev_addr.net);
kfree(id_priv);
}
EXPORT_SYMBOL(rdma_destroy_id);
@@ -1197,20 +1598,27 @@ out:
}
static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+ struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
struct rdma_route *rt;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ const __be64 service_id =
+ ib_event->param.req_rcvd.primary_path->service_id;
int ret;
- id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ id = rdma_create_id(listen_id->route.addr.dev_addr.net,
+ listen_id->event_handler, listen_id->context,
listen_id->ps, ib_event->param.req_rcvd.qp_type);
if (IS_ERR(id))
return NULL;
id_priv = container_of(id, struct rdma_id_private, id);
- if (cma_save_net_info(id, listen_id, ib_event))
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family, service_id))
goto err;
rt = &id->route;
@@ -1224,14 +1632,21 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
if (rt->num_paths == 2)
rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
- if (cma_any_addr(cma_src_addr(id_priv))) {
- rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
- rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
- ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
- } else {
- ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+ if (net_dev) {
+ ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
if (ret)
goto err;
+ } else {
+ if (!cma_protocol_roce(listen_id) &&
+ cma_any_addr(cma_src_addr(id_priv))) {
+ rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+ ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
+ } else if (!cma_any_addr(cma_src_addr(id_priv))) {
+ ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+ if (ret)
+ goto err;
+ }
}
rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
@@ -1244,25 +1659,38 @@ err:
}
static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+ struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ struct net *net = listen_id->route.addr.dev_addr.net;
int ret;
- id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ id = rdma_create_id(net, listen_id->event_handler, listen_id->context,
listen_id->ps, IB_QPT_UD);
if (IS_ERR(id))
return NULL;
id_priv = container_of(id, struct rdma_id_private, id);
- if (cma_save_net_info(id, listen_id, ib_event))
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family,
+ ib_event->param.sidr_req_rcvd.service_id))
goto err;
- if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
- ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr);
+ if (net_dev) {
+ ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
if (ret)
goto err;
+ } else {
+ if (!cma_any_addr(cma_src_addr(id_priv))) {
+ ret = cma_translate_addr(cma_src_addr(id_priv),
+ &id->route.addr.dev_addr);
+ if (ret)
+ goto err;
+ }
}
id_priv->state = RDMA_CM_CONNECT;
@@ -1300,25 +1728,33 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
{
struct rdma_id_private *listen_id, *conn_id;
struct rdma_cm_event event;
+ struct net_device *net_dev;
int offset, ret;
- listen_id = cm_id->context;
- if (!cma_check_req_qp_type(&listen_id->id, ib_event))
- return -EINVAL;
+ listen_id = cma_id_from_event(cm_id, ib_event, &net_dev);
+ if (IS_ERR(listen_id))
+ return PTR_ERR(listen_id);
- if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
- return -ECONNABORTED;
+ if (!cma_check_req_qp_type(&listen_id->id, ib_event)) {
+ ret = -EINVAL;
+ goto net_dev_put;
+ }
+
+ if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) {
+ ret = -ECONNABORTED;
+ goto net_dev_put;
+ }
memset(&event, 0, sizeof event);
offset = cma_user_data_offset(listen_id);
event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
- conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+ conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev);
event.param.ud.private_data = ib_event->private_data + offset;
event.param.ud.private_data_len =
IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
} else {
- conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+ conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev);
cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
ib_event->private_data, offset);
}
@@ -1356,6 +1792,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
mutex_unlock(&conn_id->handler_mutex);
mutex_unlock(&listen_id->handler_mutex);
cma_deref_id(conn_id);
+ if (net_dev)
+ dev_put(net_dev);
return 0;
err3:
@@ -1369,6 +1807,11 @@ err1:
mutex_unlock(&listen_id->handler_mutex);
if (conn_id)
rdma_destroy_id(&conn_id->id);
+
+net_dev_put:
+ if (net_dev)
+ dev_put(net_dev);
+
return ret;
}
@@ -1381,42 +1824,6 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
}
EXPORT_SYMBOL(rdma_get_service_id);
-static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
- struct ib_cm_compare_data *compare)
-{
- struct cma_hdr *cma_data, *cma_mask;
- __be32 ip4_addr;
- struct in6_addr ip6_addr;
-
- memset(compare, 0, sizeof *compare);
- cma_data = (void *) compare->data;
- cma_mask = (void *) compare->mask;
-
- switch (addr->sa_family) {
- case AF_INET:
- ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
- cma_set_ip_ver(cma_data, 4);
- cma_set_ip_ver(cma_mask, 0xF);
- if (!cma_any_addr(addr)) {
- cma_data->dst_addr.ip4.addr = ip4_addr;
- cma_mask->dst_addr.ip4.addr = htonl(~0);
- }
- break;
- case AF_INET6:
- ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
- cma_set_ip_ver(cma_data, 6);
- cma_set_ip_ver(cma_mask, 0xF);
- if (!cma_any_addr(addr)) {
- cma_data->dst_addr.ip6 = ip6_addr;
- memset(&cma_mask->dst_addr.ip6, 0xFF,
- sizeof cma_mask->dst_addr.ip6);
- }
- break;
- default:
- break;
- }
-}
-
static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
{
struct rdma_id_private *id_priv = iw_id->context;
@@ -1498,7 +1905,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
return -ECONNABORTED;
/* Create a new RDMA id for the new IW CM ID */
- new_cm_id = rdma_create_id(listen_id->id.event_handler,
+ new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net,
+ listen_id->id.event_handler,
listen_id->id.context,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(new_cm_id)) {
@@ -1570,33 +1978,18 @@ out:
static int cma_ib_listen(struct rdma_id_private *id_priv)
{
- struct ib_cm_compare_data compare_data;
struct sockaddr *addr;
struct ib_cm_id *id;
__be64 svc_id;
- int ret;
- id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+ addr = cma_src_addr(id_priv);
+ svc_id = rdma_get_service_id(&id_priv->id, addr);
+ id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id);
if (IS_ERR(id))
return PTR_ERR(id);
-
id_priv->cm_id.ib = id;
- addr = cma_src_addr(id_priv);
- svc_id = rdma_get_service_id(&id_priv->id, addr);
- if (cma_any_addr(addr) && !id_priv->afonly)
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
- else {
- cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
- }
-
- if (ret) {
- ib_destroy_cm_id(id_priv->cm_id.ib);
- id_priv->cm_id.ib = NULL;
- }
-
- return ret;
+ return 0;
}
static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
@@ -1610,6 +2003,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
if (IS_ERR(id))
return PTR_ERR(id);
+ id->tos = id_priv->tos;
id_priv->cm_id.iw = id;
memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
@@ -1640,13 +2034,13 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
{
struct rdma_id_private *dev_id_priv;
struct rdma_cm_id *id;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
int ret;
- if (cma_family(id_priv) == AF_IB &&
- rdma_node_get_transport(cma_dev->device->node_type) != RDMA_TRANSPORT_IB)
+ if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1))
return;
- id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps,
+ id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps,
id_priv->id.qp_type);
if (IS_ERR(id))
return;
@@ -1925,16 +2319,17 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
route->num_paths = 1;
- if (addr->dev_addr.bound_dev_if)
+ if (addr->dev_addr.bound_dev_if) {
ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+ route->path_rec->net = &init_net;
+ route->path_rec->ifindex = addr->dev_addr.bound_dev_if;
+ }
if (!ndev) {
ret = -ENODEV;
goto err2;
}
- route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev);
memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN);
- memcpy(route->path_rec->smac, ndev->dev_addr, ndev->addr_len);
rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
&route->path_rec->sgid);
@@ -1984,26 +2379,15 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
return -EINVAL;
atomic_inc(&id_priv->refcount);
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
- switch (rdma_port_get_link_layer(id->device, id->port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
- ret = cma_resolve_ib_route(id_priv, timeout_ms);
- break;
- case IB_LINK_LAYER_ETHERNET:
- ret = cma_resolve_iboe_route(id_priv);
- break;
- default:
- ret = -ENOSYS;
- }
- break;
- case RDMA_TRANSPORT_IWARP:
+ if (rdma_cap_ib_sa(id->device, id->port_num))
+ ret = cma_resolve_ib_route(id_priv, timeout_ms);
+ else if (rdma_protocol_roce(id->device, id->port_num))
+ ret = cma_resolve_iboe_route(id_priv);
+ else if (rdma_protocol_iwarp(id->device, id->port_num))
ret = cma_resolve_iw_route(id_priv, timeout_ms);
- break;
- default:
+ else
ret = -ENOSYS;
- break;
- }
+
if (ret)
goto err;
@@ -2045,7 +2429,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv)
mutex_lock(&lock);
list_for_each_entry(cur_dev, &dev_list, list) {
if (cma_family(id_priv) == AF_IB &&
- rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB)
+ !rdma_cap_ib_cm(cur_dev->device, 1))
continue;
if (!cma_dev)
@@ -2068,7 +2452,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv)
p = 1;
port_found:
- ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid);
+ ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL);
if (ret)
goto out;
@@ -2077,7 +2461,7 @@ port_found:
goto out;
id_priv->id.route.addr.dev_addr.dev_type =
- (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ?
+ (rdma_protocol_ib(cma_dev->device, p)) ?
ARPHRD_INFINIBAND : ARPHRD_ETHER;
rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
@@ -2195,8 +2579,11 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
src_addr = (struct sockaddr *) &id->route.addr.src_addr;
src_addr->sa_family = dst_addr->sa_family;
if (dst_addr->sa_family == AF_INET6) {
- ((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
- ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+ struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
+ struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
+ src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+ if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
} else if (dst_addr->sa_family == AF_IB) {
((struct sockaddr_ib *) src_addr)->sib_pkey =
((struct sockaddr_ib *) dst_addr)->sib_pkey;
@@ -2317,8 +2704,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list,
hlist_add_head(&id_priv->node, &bind_list->owners);
}
-static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
- unsigned short snum)
+static int cma_alloc_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv, unsigned short snum)
{
struct rdma_bind_list *bind_list;
int ret;
@@ -2327,7 +2714,8 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
if (!bind_list)
return -ENOMEM;
- ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL);
+ ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list,
+ snum);
if (ret < 0)
goto err;
@@ -2340,18 +2728,20 @@ err:
return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
}
-static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_alloc_any_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv)
{
static unsigned int last_used_port;
int low, high, remaining;
unsigned int rover;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
- inet_get_local_port_range(&init_net, &low, &high);
+ inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
rover = prandom_u32() % remaining + low;
retry:
if (last_used_port != rover &&
- !idr_find(ps, (unsigned short) rover)) {
+ !cma_ps_find(net, ps, (unsigned short)rover)) {
int ret = cma_alloc_port(ps, id_priv, rover);
/*
* Remember previously used port number in order to avoid
@@ -2406,7 +2796,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
return 0;
}
-static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_use_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv)
{
struct rdma_bind_list *bind_list;
unsigned short snum;
@@ -2416,7 +2807,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
return -EACCES;
- bind_list = idr_find(ps, snum);
+ bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum);
if (!bind_list) {
ret = cma_alloc_port(ps, id_priv, snum);
} else {
@@ -2439,25 +2830,24 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
return ret;
}
-static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_inet_ps(
+ struct rdma_id_private *id_priv)
{
switch (id_priv->id.ps) {
case RDMA_PS_TCP:
- return &tcp_ps;
case RDMA_PS_UDP:
- return &udp_ps;
case RDMA_PS_IPOIB:
- return &ipoib_ps;
case RDMA_PS_IB:
- return &ib_ps;
+ return id_priv->id.ps;
default:
- return NULL;
+
+ return 0;
}
}
-static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv)
{
- struct idr *ps = NULL;
+ enum rdma_port_space ps = 0;
struct sockaddr_ib *sib;
u64 sid_ps, mask, sid;
@@ -2467,15 +2857,15 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
sid_ps = RDMA_IB_IP_PS_IB;
- ps = &ib_ps;
+ ps = RDMA_PS_IB;
} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
(sid == (RDMA_IB_IP_PS_TCP & mask))) {
sid_ps = RDMA_IB_IP_PS_TCP;
- ps = &tcp_ps;
+ ps = RDMA_PS_TCP;
} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
(sid == (RDMA_IB_IP_PS_UDP & mask))) {
sid_ps = RDMA_IB_IP_PS_UDP;
- ps = &udp_ps;
+ ps = RDMA_PS_UDP;
}
if (ps) {
@@ -2488,7 +2878,7 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
static int cma_get_port(struct rdma_id_private *id_priv)
{
- struct idr *ps;
+ enum rdma_port_space ps;
int ret;
if (cma_family(id_priv) != AF_IB)
@@ -2554,18 +2944,15 @@ int rdma_listen(struct rdma_cm_id *id, int backlog)
id_priv->backlog = backlog;
if (id->device) {
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, 1)) {
ret = cma_ib_listen(id_priv);
if (ret)
goto err;
- break;
- case RDMA_TRANSPORT_IWARP:
+ } else if (rdma_cap_iw_cm(id->device, 1)) {
ret = cma_iw_listen(id_priv, backlog);
if (ret)
goto err;
- break;
- default:
+ } else {
ret = -ENOSYS;
goto err;
}
@@ -2612,8 +2999,11 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
if (addr->sa_family == AF_INET)
id_priv->afonly = 1;
#if IS_ENABLED(CONFIG_IPV6)
- else if (addr->sa_family == AF_INET6)
- id_priv->afonly = init_net.ipv6.sysctl.bindv6only;
+ else if (addr->sa_family == AF_INET6) {
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
+
+ id_priv->afonly = net->ipv6.sysctl.bindv6only;
+ }
#endif
}
ret = cma_get_port(id_priv);
@@ -2857,6 +3247,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
+ cm_id->tos = id_priv->tos;
id_priv->cm_id.iw = cm_id;
memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
@@ -2901,20 +3292,15 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
id_priv->srq = conn_param->srq;
}
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
if (id->qp_type == IB_QPT_UD)
ret = cma_resolve_ib_udp(id_priv, conn_param);
else
ret = cma_connect_ib(id_priv, conn_param);
- break;
- case RDMA_TRANSPORT_IWARP:
+ } else if (rdma_cap_iw_cm(id->device, id->port_num))
ret = cma_connect_iw(id_priv, conn_param);
- break;
- default:
+ else
ret = -ENOSYS;
- break;
- }
if (ret)
goto err;
@@ -3017,8 +3403,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
id_priv->srq = conn_param->srq;
}
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
if (id->qp_type == IB_QPT_UD) {
if (conn_param)
ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
@@ -3034,14 +3419,10 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
else
ret = cma_rep_recv(id_priv);
}
- break;
- case RDMA_TRANSPORT_IWARP:
+ } else if (rdma_cap_iw_cm(id->device, id->port_num))
ret = cma_accept_iw(id_priv, conn_param);
- break;
- default:
+ else
ret = -ENOSYS;
- break;
- }
if (ret)
goto reject;
@@ -3085,8 +3466,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
if (!id_priv->cm_id.ib)
return -EINVAL;
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
if (id->qp_type == IB_QPT_UD)
ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0,
private_data, private_data_len);
@@ -3094,15 +3474,12 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
ret = ib_send_cm_rej(id_priv->cm_id.ib,
IB_CM_REJ_CONSUMER_DEFINED, NULL,
0, private_data, private_data_len);
- break;
- case RDMA_TRANSPORT_IWARP:
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
ret = iw_cm_reject(id_priv->cm_id.iw,
private_data, private_data_len);
- break;
- default:
+ } else
ret = -ENOSYS;
- break;
- }
+
return ret;
}
EXPORT_SYMBOL(rdma_reject);
@@ -3116,22 +3493,18 @@ int rdma_disconnect(struct rdma_cm_id *id)
if (!id_priv->cm_id.ib)
return -EINVAL;
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
ret = cma_modify_qp_err(id_priv);
if (ret)
goto out;
/* Initiate or respond to a disconnect. */
if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
- break;
- case RDMA_TRANSPORT_IWARP:
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
- break;
- default:
+ } else
ret = -EINVAL;
- break;
- }
+
out:
return ret;
}
@@ -3377,24 +3750,13 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
list_add(&mc->list, &id_priv->mc_list);
spin_unlock(&id_priv->lock);
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
- switch (rdma_port_get_link_layer(id->device, id->port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
- ret = cma_join_ib_multicast(id_priv, mc);
- break;
- case IB_LINK_LAYER_ETHERNET:
- kref_init(&mc->mcref);
- ret = cma_iboe_join_multicast(id_priv, mc);
- break;
- default:
- ret = -EINVAL;
- }
- break;
- default:
+ if (rdma_protocol_roce(id->device, id->port_num)) {
+ kref_init(&mc->mcref);
+ ret = cma_iboe_join_multicast(id_priv, mc);
+ } else if (rdma_cap_ib_mcast(id->device, id->port_num))
+ ret = cma_join_ib_multicast(id_priv, mc);
+ else
ret = -ENOSYS;
- break;
- }
if (ret) {
spin_lock_irq(&id_priv->lock);
@@ -3422,19 +3784,15 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
ib_detach_mcast(id->qp,
&mc->multicast.ib->rec.mgid,
be16_to_cpu(mc->multicast.ib->rec.mlid));
- if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) {
- switch (rdma_port_get_link_layer(id->device, id->port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
- ib_sa_free_multicast(mc->multicast.ib);
- kfree(mc);
- break;
- case IB_LINK_LAYER_ETHERNET:
- kref_put(&mc->mcref, release_mc);
- break;
- default:
- break;
- }
- }
+
+ BUG_ON(id_priv->cma_dev->device != id->device);
+
+ if (rdma_cap_ib_mcast(id->device, id->port_num)) {
+ ib_sa_free_multicast(mc->multicast.ib);
+ kfree(mc);
+ } else if (rdma_protocol_roce(id->device, id->port_num))
+ kref_put(&mc->mcref, release_mc);
+
return;
}
}
@@ -3450,6 +3808,7 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id
dev_addr = &id_priv->id.route.addr.dev_addr;
if ((dev_addr->bound_dev_if == ndev->ifindex) &&
+ (net_eq(dev_net(ndev), dev_addr->net)) &&
memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
ndev->name, &id_priv->id);
@@ -3475,9 +3834,6 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
struct rdma_id_private *id_priv;
int ret = NOTIFY_DONE;
- if (dev_net(ndev) != &init_net)
- return NOTIFY_DONE;
-
if (event != NETDEV_BONDING_FAILOVER)
return NOTIFY_DONE;
@@ -3578,11 +3934,10 @@ static void cma_process_remove(struct cma_device *cma_dev)
wait_for_completion(&cma_dev->comp);
}
-static void cma_remove_one(struct ib_device *device)
+static void cma_remove_one(struct ib_device *device, void *client_data)
{
- struct cma_device *cma_dev;
+ struct cma_device *cma_dev = client_data;
- cma_dev = ib_get_client_data(device, &cma_client);
if (!cma_dev)
return;
@@ -3673,6 +4028,35 @@ static const struct ibnl_client_cbs cma_cb_table[] = {
.module = THIS_MODULE },
};
+static int cma_init_net(struct net *net)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ idr_init(&pernet->tcp_ps);
+ idr_init(&pernet->udp_ps);
+ idr_init(&pernet->ipoib_ps);
+ idr_init(&pernet->ib_ps);
+
+ return 0;
+}
+
+static void cma_exit_net(struct net *net)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ idr_destroy(&pernet->tcp_ps);
+ idr_destroy(&pernet->udp_ps);
+ idr_destroy(&pernet->ipoib_ps);
+ idr_destroy(&pernet->ib_ps);
+}
+
+static struct pernet_operations cma_pernet_operations = {
+ .init = cma_init_net,
+ .exit = cma_exit_net,
+ .id = &cma_pernet_id,
+ .size = sizeof(struct cma_pernet),
+};
+
static int __init cma_init(void)
{
int ret;
@@ -3681,6 +4065,10 @@ static int __init cma_init(void)
if (!cma_wq)
return -ENOMEM;
+ ret = register_pernet_subsys(&cma_pernet_operations);
+ if (ret)
+ goto err_wq;
+
ib_sa_register_client(&sa_client);
rdma_addr_register_client(&addr_client);
register_netdevice_notifier(&cma_nb);
@@ -3698,6 +4086,7 @@ err:
unregister_netdevice_notifier(&cma_nb);
rdma_addr_unregister_client(&addr_client);
ib_sa_unregister_client(&sa_client);
+err_wq:
destroy_workqueue(cma_wq);
return ret;
}
@@ -3709,11 +4098,8 @@ static void __exit cma_cleanup(void)
unregister_netdevice_notifier(&cma_nb);
rdma_addr_unregister_client(&addr_client);
ib_sa_unregister_client(&sa_client);
+ unregister_pernet_subsys(&cma_pernet_operations);
destroy_workqueue(cma_wq);
- idr_destroy(&tcp_ps);
- idr_destroy(&udp_ps);
- idr_destroy(&ipoib_ps);
- idr_destroy(&ib_ps);
}
module_init(cma_init);
diff --git a/kernel/drivers/infiniband/core/core_priv.h b/kernel/drivers/infiniband/core/core_priv.h
index 87d1936f5..5cf6eb716 100644
--- a/kernel/drivers/infiniband/core/core_priv.h
+++ b/kernel/drivers/infiniband/core/core_priv.h
@@ -43,12 +43,53 @@ int ib_device_register_sysfs(struct ib_device *device,
u8, struct kobject *));
void ib_device_unregister_sysfs(struct ib_device *device);
-int ib_sysfs_setup(void);
-void ib_sysfs_cleanup(void);
-
-int ib_cache_setup(void);
+void ib_cache_setup(void);
void ib_cache_cleanup(void);
-int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
- struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+int ib_resolve_eth_dmac(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+
+typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
+ struct net_device *idev, void *cookie);
+
+typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port,
+ struct net_device *idev, void *cookie);
+
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+
+enum ib_cache_gid_default_mode {
+ IB_CACHE_GID_DEFAULT_MODE_SET,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE
+};
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev,
+ enum ib_cache_gid_default_mode mode);
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev);
+
+int roce_gid_mgmt_init(void);
+void roce_gid_mgmt_cleanup(void);
+
+int roce_rescan_device(struct ib_device *ib_dev);
+
+int ib_cache_setup_one(struct ib_device *device);
+void ib_cache_cleanup_one(struct ib_device *device);
+void ib_cache_release_one(struct ib_device *device);
+
#endif /* _CORE_PRIV_H */
diff --git a/kernel/drivers/infiniband/core/device.c b/kernel/drivers/infiniband/core/device.c
index 18c1ece76..179e8134d 100644
--- a/kernel/drivers/infiniband/core/device.c
+++ b/kernel/drivers/infiniband/core/device.c
@@ -38,7 +38,10 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mutex.h>
+#include <linux/netdevice.h>
#include <rdma/rdma_netlink.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
#include "core_priv.h"
@@ -50,22 +53,34 @@ struct ib_client_data {
struct list_head list;
struct ib_client *client;
void * data;
+ /* The device or client is going down. Do not call client or device
+ * callbacks other than remove(). */
+ bool going_down;
};
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
+/* The device_list and client_list contain devices and clients after their
+ * registration has completed, and the devices and clients are removed
+ * during unregistration. */
static LIST_HEAD(device_list);
static LIST_HEAD(client_list);
/*
- * device_mutex protects access to both device_list and client_list.
- * There's no real point to using multiple locks or something fancier
- * like an rwsem: we always access both lists, and we're always
- * modifying one list or the other list. In any case this is not a
- * hot path so there's no point in trying to optimize.
+ * device_mutex and lists_rwsem protect access to both device_list and
+ * client_list. device_mutex protects writer access by device and client
+ * registration / de-registration. lists_rwsem protects reader access to
+ * these lists. Iterators of these lists must lock it for read, while updates
+ * to the lists must be done with a write lock. A special case is when the
+ * device_mutex is locked. In this case locking the lists for read access is
+ * not necessary as the device_mutex implies it.
+ *
+ * lists_rwsem also protects access to the client data list.
*/
static DEFINE_MUTEX(device_mutex);
+static DECLARE_RWSEM(lists_rwsem);
+
static int ib_device_check_mandatory(struct ib_device *device)
{
@@ -92,7 +107,8 @@ static int ib_device_check_mandatory(struct ib_device *device)
IB_MANDATORY_FUNC(poll_cq),
IB_MANDATORY_FUNC(req_notify_cq),
IB_MANDATORY_FUNC(get_dma_mr),
- IB_MANDATORY_FUNC(dereg_mr)
+ IB_MANDATORY_FUNC(dereg_mr),
+ IB_MANDATORY_FUNC(get_port_immutable)
};
int i;
@@ -151,18 +167,36 @@ static int alloc_name(char *name)
return 0;
}
-static int start_port(struct ib_device *device)
+static void ib_device_release(struct device *device)
{
- return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
-}
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+ ib_cache_release_one(dev);
+ kfree(dev->port_immutable);
+ kfree(dev);
+}
-static int end_port(struct ib_device *device)
+static int ib_device_uevent(struct device *device,
+ struct kobj_uevent_env *env)
{
- return (device->node_type == RDMA_NODE_IB_SWITCH) ?
- 0 : device->phys_port_cnt;
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ if (add_uevent_var(env, "NAME=%s", dev->name))
+ return -ENOMEM;
+
+ /*
+ * It would be nice to pass the node GUID with the event...
+ */
+
+ return 0;
}
+static struct class ib_class = {
+ .name = "infiniband",
+ .dev_release = ib_device_release,
+ .dev_uevent = ib_device_uevent,
+};
+
/**
* ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
@@ -175,9 +209,27 @@ static int end_port(struct ib_device *device)
*/
struct ib_device *ib_alloc_device(size_t size)
{
- BUG_ON(size < sizeof (struct ib_device));
+ struct ib_device *device;
+
+ if (WARN_ON(size < sizeof(struct ib_device)))
+ return NULL;
+
+ device = kzalloc(size, GFP_KERNEL);
+ if (!device)
+ return NULL;
+
+ device->dev.class = &ib_class;
+ device_initialize(&device->dev);
+
+ dev_set_drvdata(&device->dev, device);
+
+ INIT_LIST_HEAD(&device->event_handler_list);
+ spin_lock_init(&device->event_handler_lock);
+ spin_lock_init(&device->client_data_lock);
+ INIT_LIST_HEAD(&device->client_data_list);
+ INIT_LIST_HEAD(&device->port_list);
- return kzalloc(size, GFP_KERNEL);
+ return device;
}
EXPORT_SYMBOL(ib_alloc_device);
@@ -189,13 +241,8 @@ EXPORT_SYMBOL(ib_alloc_device);
*/
void ib_dealloc_device(struct ib_device *device)
{
- if (device->reg_state == IB_DEV_UNINITIALIZED) {
- kfree(device);
- return;
- }
-
- BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
-
+ WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
+ device->reg_state != IB_DEV_UNINITIALIZED);
kobject_put(&device->dev.kobj);
}
EXPORT_SYMBOL(ib_dealloc_device);
@@ -214,51 +261,53 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
context->client = client;
context->data = NULL;
+ context->going_down = false;
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_add(&context->list, &device->client_data_list);
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
return 0;
}
-static int read_port_table_lengths(struct ib_device *device)
+static int verify_immutable(const struct ib_device *dev, u8 port)
{
- struct ib_port_attr *tprops = NULL;
- int num_ports, ret = -ENOMEM;
- u8 port_index;
-
- tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
- if (!tprops)
- goto out;
-
- num_ports = end_port(device) - start_port(device) + 1;
+ return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
+ rdma_max_mad_size(dev, port) != 0);
+}
- device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
- GFP_KERNEL);
- device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
- GFP_KERNEL);
- if (!device->pkey_tbl_len || !device->gid_tbl_len)
- goto err;
+static int read_port_immutable(struct ib_device *device)
+{
+ int ret;
+ u8 start_port = rdma_start_port(device);
+ u8 end_port = rdma_end_port(device);
+ u8 port;
+
+ /**
+ * device->port_immutable is indexed directly by the port number to make
+ * access to this data as efficient as possible.
+ *
+ * Therefore port_immutable is declared as a 1 based array with
+ * potential empty slots at the beginning.
+ */
+ device->port_immutable = kzalloc(sizeof(*device->port_immutable)
+ * (end_port + 1),
+ GFP_KERNEL);
+ if (!device->port_immutable)
+ return -ENOMEM;
- for (port_index = 0; port_index < num_ports; ++port_index) {
- ret = ib_query_port(device, port_index + start_port(device),
- tprops);
+ for (port = start_port; port <= end_port; ++port) {
+ ret = device->get_port_immutable(device, port,
+ &device->port_immutable[port]);
if (ret)
- goto err;
- device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
- device->gid_tbl_len[port_index] = tprops->gid_tbl_len;
- }
-
- ret = 0;
- goto out;
+ return ret;
-err:
- kfree(device->gid_tbl_len);
- kfree(device->pkey_tbl_len);
-out:
- kfree(tprops);
- return ret;
+ if (verify_immutable(device, port))
+ return -EINVAL;
+ }
+ return 0;
}
/**
@@ -275,6 +324,7 @@ int ib_register_device(struct ib_device *device,
u8, struct kobject *))
{
int ret;
+ struct ib_client *client;
mutex_lock(&device_mutex);
@@ -289,40 +339,37 @@ int ib_register_device(struct ib_device *device,
goto out;
}
- INIT_LIST_HEAD(&device->event_handler_list);
- INIT_LIST_HEAD(&device->client_data_list);
- spin_lock_init(&device->event_handler_lock);
- spin_lock_init(&device->client_data_lock);
-
- ret = read_port_table_lengths(device);
+ ret = read_port_immutable(device);
if (ret) {
- printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+ printk(KERN_WARNING "Couldn't create per port immutable data %s\n",
device->name);
goto out;
}
+ ret = ib_cache_setup_one(device);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+ goto out;
+ }
+
ret = ib_device_register_sysfs(device, port_callback);
if (ret) {
printk(KERN_WARNING "Couldn't register device %s with driver model\n",
device->name);
- kfree(device->gid_tbl_len);
- kfree(device->pkey_tbl_len);
+ ib_cache_cleanup_one(device);
goto out;
}
- list_add_tail(&device->core_list, &device_list);
-
device->reg_state = IB_DEV_REGISTERED;
- {
- struct ib_client *client;
-
- list_for_each_entry(client, &client_list, list)
- if (client->add && !add_client_context(device, client))
- client->add(device);
- }
+ list_for_each_entry(client, &client_list, list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
- out:
+ down_write(&lists_rwsem);
+ list_add_tail(&device->core_list, &device_list);
+ up_write(&lists_rwsem);
+out:
mutex_unlock(&device_mutex);
return ret;
}
@@ -336,29 +383,37 @@ EXPORT_SYMBOL(ib_register_device);
*/
void ib_unregister_device(struct ib_device *device)
{
- struct ib_client *client;
struct ib_client_data *context, *tmp;
unsigned long flags;
mutex_lock(&device_mutex);
- list_for_each_entry_reverse(client, &client_list, list)
- if (client->remove)
- client->remove(device);
-
+ down_write(&lists_rwsem);
list_del(&device->core_list);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ context->going_down = true;
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ downgrade_write(&lists_rwsem);
- kfree(device->gid_tbl_len);
- kfree(device->pkey_tbl_len);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list,
+ list) {
+ if (context->client->remove)
+ context->client->remove(device, context->data);
+ }
+ up_read(&lists_rwsem);
mutex_unlock(&device_mutex);
ib_device_unregister_sysfs(device);
+ ib_cache_cleanup_one(device);
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
kfree(context);
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
device->reg_state = IB_DEV_UNREGISTERED;
}
@@ -383,11 +438,14 @@ int ib_register_client(struct ib_client *client)
mutex_lock(&device_mutex);
- list_add_tail(&client->list, &client_list);
list_for_each_entry(device, &device_list, core_list)
if (client->add && !add_client_context(device, client))
client->add(device);
+ down_write(&lists_rwsem);
+ list_add_tail(&client->list, &client_list);
+ up_write(&lists_rwsem);
+
mutex_unlock(&device_mutex);
return 0;
@@ -410,19 +468,41 @@ void ib_unregister_client(struct ib_client *client)
mutex_lock(&device_mutex);
+ down_write(&lists_rwsem);
+ list_del(&client->list);
+ up_write(&lists_rwsem);
+
list_for_each_entry(device, &device_list, core_list) {
- if (client->remove)
- client->remove(device);
+ struct ib_client_data *found_context = NULL;
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
if (context->client == client) {
- list_del(&context->list);
- kfree(context);
+ context->going_down = true;
+ found_context = context;
+ break;
}
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
+
+ if (client->remove)
+ client->remove(device, found_context ?
+ found_context->data : NULL);
+
+ if (!found_context) {
+ pr_warn("No client context found for %s/%s\n",
+ device->name, client->name);
+ continue;
+ }
+
+ down_write(&lists_rwsem);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_del(&found_context->list);
+ kfree(found_context);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
}
- list_del(&client->list);
mutex_unlock(&device_mutex);
}
@@ -558,7 +638,11 @@ EXPORT_SYMBOL(ib_dispatch_event);
int ib_query_device(struct ib_device *device,
struct ib_device_attr *device_attr)
{
- return device->query_device(device, device_attr);
+ struct ib_udata uhw = {.outlen = 0, .inlen = 0};
+
+ memset(device_attr, 0, sizeof(*device_attr));
+
+ return device->query_device(device, device_attr, &uhw);
}
EXPORT_SYMBOL(ib_query_device);
@@ -575,7 +659,7 @@ int ib_query_port(struct ib_device *device,
u8 port_num,
struct ib_port_attr *port_attr)
{
- if (port_num < start_port(device) || port_num > end_port(device))
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
return -EINVAL;
return device->query_port(device, port_num, port_attr);
@@ -588,17 +672,92 @@ EXPORT_SYMBOL(ib_query_port);
* @port_num:Port number to query
* @index:GID table index to query
* @gid:Returned GID
+ * @attr: Returned GID attributes related to this GID index (only in RoCE).
+ * NULL means ignore.
*
* ib_query_gid() fetches the specified GID table entry.
*/
int ib_query_gid(struct ib_device *device,
- u8 port_num, int index, union ib_gid *gid)
+ u8 port_num, int index, union ib_gid *gid,
+ struct ib_gid_attr *attr)
{
+ if (rdma_cap_roce_gid_table(device, port_num))
+ return ib_get_cached_gid(device, port_num, index, gid, attr);
+
+ if (attr)
+ return -EINVAL;
+
return device->query_gid(device, port_num, index, gid);
}
EXPORT_SYMBOL(ib_query_gid);
/**
+ * ib_enum_roce_netdev - enumerate all RoCE ports
+ * @ib_dev : IB device we want to query
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all of the physical RoCE ports of ib_dev
+ * which are related to netdevice and calls callback() on each
+ * device for which filter() function returns non zero.
+ */
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ u8 port;
+
+ for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
+ port++)
+ if (rdma_protocol_roce(ib_dev, port)) {
+ struct net_device *idev = NULL;
+
+ if (ib_dev->get_netdev)
+ idev = ib_dev->get_netdev(ib_dev, port);
+
+ if (idev &&
+ idev->reg_state >= NETREG_UNREGISTERED) {
+ dev_put(idev);
+ idev = NULL;
+ }
+
+ if (filter(ib_dev, port, idev, filter_cookie))
+ cb(ib_dev, port, idev, cookie);
+
+ if (idev)
+ dev_put(idev);
+ }
+}
+
+/**
+ * ib_enum_all_roce_netdevs - enumerate all RoCE devices
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all RoCE devices' physical ports which are related
+ * to netdevices and calls callback() on each device for which
+ * filter() function returns non zero.
+ */
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ struct ib_device *dev;
+
+ down_read(&lists_rwsem);
+ list_for_each_entry(dev, &device_list, core_list)
+ ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
+ up_read(&lists_rwsem);
+}
+
+/**
* ib_query_pkey - Get P_Key table entry
* @device:Device to query
* @port_num:Port number to query
@@ -653,7 +812,7 @@ int ib_modify_port(struct ib_device *device,
if (!device->modify_port)
return -ENOSYS;
- if (port_num < start_port(device) || port_num > end_port(device))
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
return -EINVAL;
return device->modify_port(device, port_num, port_modify_mask,
@@ -666,19 +825,28 @@ EXPORT_SYMBOL(ib_modify_port);
* a specified GID value occurs.
* @device: The device to query.
* @gid: The GID value to search for.
+ * @ndev: The ndev related to the GID to search for.
* @port_num: The port number of the device where the GID value was found.
* @index: The index into the GID table where the GID was found. This
* parameter may be NULL.
*/
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
- u8 *port_num, u16 *index)
+ struct net_device *ndev, u8 *port_num, u16 *index)
{
union ib_gid tmp_gid;
int ret, port, i;
- for (port = start_port(device); port <= end_port(device); ++port) {
- for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
- ret = ib_query_gid(device, port, i, &tmp_gid);
+ for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+ if (rdma_cap_roce_gid_table(device, port)) {
+ if (!ib_find_cached_gid_by_port(device, gid, port,
+ ndev, index)) {
+ *port_num = port;
+ return 0;
+ }
+ }
+
+ for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
+ ret = ib_query_gid(device, port, i, &tmp_gid, NULL);
if (ret)
return ret;
if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
@@ -709,7 +877,7 @@ int ib_find_pkey(struct ib_device *device,
u16 tmp_pkey;
int partial_ix = -1;
- for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+ for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
if (ret)
return ret;
@@ -733,6 +901,51 @@ int ib_find_pkey(struct ib_device *device,
}
EXPORT_SYMBOL(ib_find_pkey);
+/**
+ * ib_get_net_dev_by_params() - Return the appropriate net_dev
+ * for a received CM request
+ * @dev: An RDMA device on which the request has been received.
+ * @port: Port number on the RDMA device.
+ * @pkey: The Pkey the request came on.
+ * @gid: A GID that the net_dev uses to communicate.
+ * @addr: Contains the IP address that the request specified as its
+ * destination.
+ */
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ const union ib_gid *gid,
+ const struct sockaddr *addr)
+{
+ struct net_device *net_dev = NULL;
+ struct ib_client_data *context;
+
+ if (!rdma_protocol_ib(dev, port))
+ return NULL;
+
+ down_read(&lists_rwsem);
+
+ list_for_each_entry(context, &dev->client_data_list, list) {
+ struct ib_client *client = context->client;
+
+ if (context->going_down)
+ continue;
+
+ if (client->get_net_dev_by_params) {
+ net_dev = client->get_net_dev_by_params(dev, port, pkey,
+ gid, addr,
+ context->data);
+ if (net_dev)
+ break;
+ }
+ }
+
+ up_read(&lists_rwsem);
+
+ return net_dev;
+}
+EXPORT_SYMBOL(ib_get_net_dev_by_params);
+
static int __init ib_core_init(void)
{
int ret;
@@ -741,7 +954,7 @@ static int __init ib_core_init(void)
if (!ib_wq)
return -ENOMEM;
- ret = ib_sysfs_setup();
+ ret = class_register(&ib_class);
if (ret) {
printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
goto err;
@@ -753,19 +966,12 @@ static int __init ib_core_init(void)
goto err_sysfs;
}
- ret = ib_cache_setup();
- if (ret) {
- printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
- goto err_nl;
- }
+ ib_cache_setup();
return 0;
-err_nl:
- ibnl_cleanup();
-
err_sysfs:
- ib_sysfs_cleanup();
+ class_unregister(&ib_class);
err:
destroy_workqueue(ib_wq);
@@ -776,7 +982,7 @@ static void __exit ib_core_cleanup(void)
{
ib_cache_cleanup();
ibnl_cleanup();
- ib_sysfs_cleanup();
+ class_unregister(&ib_class);
/* Make sure that any pending umem accounting work is done. */
destroy_workqueue(ib_wq);
}
diff --git a/kernel/drivers/infiniband/core/iwpm_msg.c b/kernel/drivers/infiniband/core/iwpm_msg.c
index e6ffa2e66..22a3abee2 100644
--- a/kernel/drivers/infiniband/core/iwpm_msg.c
+++ b/kernel/drivers/infiniband/core/iwpm_msg.c
@@ -67,7 +67,8 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
err_str = "Invalid port mapper client";
goto pid_query_error;
}
- if (iwpm_registered_client(nl_client))
+ if (iwpm_check_registration(nl_client, IWPM_REG_VALID) ||
+ iwpm_user_pid == IWPM_PID_UNAVAILABLE)
return 0;
skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client);
if (!skb) {
@@ -106,7 +107,6 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL);
if (ret) {
skb = NULL; /* skb is freed in the netlink send-op handling */
- iwpm_set_registered(nl_client, 1);
iwpm_user_pid = IWPM_PID_UNAVAILABLE;
err_str = "Unable to send a nlmsg";
goto pid_query_error;
@@ -144,12 +144,12 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
err_str = "Invalid port mapper client";
goto add_mapping_error;
}
- if (!iwpm_registered_client(nl_client)) {
+ if (!iwpm_valid_pid())
+ return 0;
+ if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) {
err_str = "Unregistered port mapper client";
goto add_mapping_error;
}
- if (!iwpm_valid_pid())
- return 0;
skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client);
if (!skb) {
err_str = "Unable to create a nlmsg";
@@ -214,12 +214,12 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
err_str = "Invalid port mapper client";
goto query_mapping_error;
}
- if (!iwpm_registered_client(nl_client)) {
+ if (!iwpm_valid_pid())
+ return 0;
+ if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) {
err_str = "Unregistered port mapper client";
goto query_mapping_error;
}
- if (!iwpm_valid_pid())
- return 0;
ret = -ENOMEM;
skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client);
if (!skb) {
@@ -288,12 +288,12 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)
err_str = "Invalid port mapper client";
goto remove_mapping_error;
}
- if (!iwpm_registered_client(nl_client)) {
+ if (!iwpm_valid_pid())
+ return 0;
+ if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) {
err_str = "Unregistered port mapper client";
goto remove_mapping_error;
}
- if (!iwpm_valid_pid())
- return 0;
skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client);
if (!skb) {
ret = -ENOMEM;
@@ -388,7 +388,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
__func__, iwpm_user_pid);
if (iwpm_valid_client(nl_client))
- iwpm_set_registered(nl_client, 1);
+ iwpm_set_registration(nl_client, IWPM_REG_VALID);
register_pid_response_exit:
nlmsg_request->request_done = 1;
/* always for found nlmsg_request */
@@ -644,7 +644,6 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX];
const char *msg_type = "Mapping Info response";
- int iwpm_pid;
u8 nl_client;
char *iwpm_name;
u16 iwpm_version;
@@ -669,14 +668,14 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
__func__, nl_client);
return ret;
}
- iwpm_set_registered(nl_client, 0);
+ iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ iwpm_user_pid = cb->nlh->nlmsg_pid;
if (!iwpm_mapinfo_available())
return 0;
- iwpm_pid = cb->nlh->nlmsg_pid;
pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
- __func__, iwpm_pid);
- ret = iwpm_send_mapinfo(nl_client, iwpm_pid);
+ __func__, iwpm_user_pid);
+ ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid);
return ret;
}
EXPORT_SYMBOL(iwpm_mapping_info_cb);
diff --git a/kernel/drivers/infiniband/core/iwpm_util.c b/kernel/drivers/infiniband/core/iwpm_util.c
index a626795bf..5fb089e91 100644
--- a/kernel/drivers/infiniband/core/iwpm_util.c
+++ b/kernel/drivers/infiniband/core/iwpm_util.c
@@ -78,6 +78,7 @@ init_exit:
mutex_unlock(&iwpm_admin_lock);
if (!ret) {
iwpm_set_valid(nl_client, 1);
+ iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
pr_debug("%s: Mapinfo and reminfo tables are created\n",
__func__);
}
@@ -106,6 +107,7 @@ int iwpm_exit(u8 nl_client)
}
mutex_unlock(&iwpm_admin_lock);
iwpm_set_valid(nl_client, 0);
+ iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
return 0;
}
EXPORT_SYMBOL(iwpm_exit);
@@ -397,17 +399,23 @@ void iwpm_set_valid(u8 nl_client, int valid)
}
/* valid client */
-int iwpm_registered_client(u8 nl_client)
+u32 iwpm_get_registration(u8 nl_client)
{
return iwpm_admin.reg_list[nl_client];
}
/* valid client */
-void iwpm_set_registered(u8 nl_client, int reg)
+void iwpm_set_registration(u8 nl_client, u32 reg)
{
iwpm_admin.reg_list[nl_client] = reg;
}
+/* valid client */
+u32 iwpm_check_registration(u8 nl_client, u32 reg)
+{
+ return (iwpm_get_registration(nl_client) & reg);
+}
+
int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
struct sockaddr_storage *b_sockaddr)
{
diff --git a/kernel/drivers/infiniband/core/iwpm_util.h b/kernel/drivers/infiniband/core/iwpm_util.h
index ee2d9ff09..b7b9e194c 100644
--- a/kernel/drivers/infiniband/core/iwpm_util.h
+++ b/kernel/drivers/infiniband/core/iwpm_util.h
@@ -58,6 +58,10 @@
#define IWPM_PID_UNDEFINED -1
#define IWPM_PID_UNAVAILABLE -2
+#define IWPM_REG_UNDEF 0x01
+#define IWPM_REG_VALID 0x02
+#define IWPM_REG_INCOMPL 0x04
+
struct iwpm_nlmsg_request {
struct list_head inprocess_list;
__u32 nlmsg_seq;
@@ -88,7 +92,7 @@ struct iwpm_admin_data {
atomic_t refcount;
atomic_t nlmsg_seq;
int client_list[RDMA_NL_NUM_CLIENTS];
- int reg_list[RDMA_NL_NUM_CLIENTS];
+ u32 reg_list[RDMA_NL_NUM_CLIENTS];
};
/**
@@ -159,19 +163,31 @@ int iwpm_valid_client(u8 nl_client);
void iwpm_set_valid(u8 nl_client, int valid);
/**
- * iwpm_registered_client - Check if the port mapper client is registered
+ * iwpm_check_registration - Check if the client registration
+ * matches the given one
* @nl_client: The index of the netlink client
+ * @reg: The given registration type to compare with
*
* Call iwpm_register_pid() to register a client
+ * Returns true if the client registration matches reg,
+ * otherwise returns false
+ */
+u32 iwpm_check_registration(u8 nl_client, u32 reg);
+
+/**
+ * iwpm_set_registration - Set the client registration
+ * @nl_client: The index of the netlink client
+ * @reg: Registration type to set
*/
-int iwpm_registered_client(u8 nl_client);
+void iwpm_set_registration(u8 nl_client, u32 reg);
/**
- * iwpm_set_registered - Set the port mapper client to registered or not
+ * iwpm_get_registration
* @nl_client: The index of the netlink client
- * @reg: 1 if registered or 0 if not
+ *
+ * Returns the client registration type
*/
-void iwpm_set_registered(u8 nl_client, int reg);
+u32 iwpm_get_registration(u8 nl_client);
/**
* iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of
diff --git a/kernel/drivers/infiniband/core/mad.c b/kernel/drivers/infiniband/core/mad.c
index 74c30f4c5..2281de122 100644
--- a/kernel/drivers/infiniband/core/mad.c
+++ b/kernel/drivers/infiniband/core/mad.c
@@ -3,6 +3,7 @@
* Copyright (c) 2005 Intel Corporation. All rights reserved.
* Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
* Copyright (c) 2009 HNR Consulting. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -44,6 +45,7 @@
#include "mad_priv.h"
#include "mad_rmpp.h"
#include "smi.h"
+#include "opa_smi.h"
#include "agent.h"
MODULE_LICENSE("Dual BSD/GPL");
@@ -59,8 +61,6 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
-static struct kmem_cache *ib_mad_cache;
-
static struct list_head ib_mad_port_list;
static u32 ib_mad_client_id = 0;
@@ -73,7 +73,7 @@ static int method_in_use(struct ib_mad_mgmt_method_table **method,
static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
static struct ib_mad_agent_private *find_mad_agent(
struct ib_mad_port_private *port_priv,
- struct ib_mad *mad);
+ const struct ib_mad_hdr *mad);
static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
struct ib_mad_private *mad);
static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
@@ -179,12 +179,12 @@ static int is_vendor_method_in_use(
return 0;
}
-int ib_response_mad(struct ib_mad *mad)
+int ib_response_mad(const struct ib_mad_hdr *hdr)
{
- return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) ||
- (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) ||
- ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) &&
- (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP)));
+ return ((hdr->method & IB_MGMT_METHOD_RESP) ||
+ (hdr->method == IB_MGMT_METHOD_TRAP_REPRESS) ||
+ ((hdr->mgmt_class == IB_MGMT_CLASS_BM) &&
+ (hdr->attr_mod & IB_BM_ATTR_MOD_RESP)));
}
EXPORT_SYMBOL(ib_response_mad);
@@ -338,13 +338,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
goto error1;
}
- mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
- IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(mad_agent_priv->agent.mr)) {
- ret = ERR_PTR(-ENOMEM);
- goto error2;
- }
-
if (mad_reg_req) {
reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
if (!reg_req) {
@@ -429,8 +422,6 @@ error4:
spin_unlock_irqrestore(&port_priv->reg_lock, flags);
kfree(reg_req);
error3:
- ib_dereg_mr(mad_agent_priv->agent.mr);
-error2:
kfree(mad_agent_priv);
error1:
return ret;
@@ -590,7 +581,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
wait_for_completion(&mad_agent_priv->comp);
kfree(mad_agent_priv->reg_req);
- ib_dereg_mr(mad_agent_priv->agent.mr);
kfree(mad_agent_priv);
}
@@ -717,6 +707,32 @@ static void build_smp_wc(struct ib_qp *qp,
wc->port_num = port_num;
}
+static size_t mad_priv_size(const struct ib_mad_private *mp)
+{
+ return sizeof(struct ib_mad_private) + mp->mad_size;
+}
+
+static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags)
+{
+ size_t size = sizeof(struct ib_mad_private) + mad_size;
+ struct ib_mad_private *ret = kzalloc(size, flags);
+
+ if (ret)
+ ret->mad_size = mad_size;
+
+ return ret;
+}
+
+static size_t port_mad_size(const struct ib_mad_port_private *port_priv)
+{
+ return rdma_max_mad_size(port_priv->device, port_priv->port_num);
+}
+
+static size_t mad_priv_dma_size(const struct ib_mad_private *mp)
+{
+ return sizeof(struct ib_grh) + mp->mad_size;
+}
+
/*
* Return 0 if SMP is to be sent
* Return 1 if SMP was consumed locally (whether or not solicited)
@@ -727,6 +743,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
{
int ret = 0;
struct ib_smp *smp = mad_send_wr->send_buf.mad;
+ struct opa_smp *opa_smp = (struct opa_smp *)smp;
unsigned long flags;
struct ib_mad_local_private *local;
struct ib_mad_private *mad_priv;
@@ -735,11 +752,16 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
struct ib_device *device = mad_agent_priv->agent.device;
u8 port_num;
struct ib_wc mad_wc;
- struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
-
- if (device->node_type == RDMA_NODE_IB_SWITCH &&
+ struct ib_ud_wr *send_wr = &mad_send_wr->send_wr;
+ size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv);
+ u16 out_mad_pkey_index = 0;
+ u16 drslid;
+ bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
+ mad_agent_priv->qp_info->port_priv->port_num);
+
+ if (rdma_cap_ib_switch(device) &&
smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
- port_num = send_wr->wr.ud.port_num;
+ port_num = send_wr->port_num;
else
port_num = mad_agent_priv->agent.port_num;
@@ -749,19 +771,49 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
* If we are at the start of the LID routed part, don't update the
* hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec.
*/
- if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
- IB_LID_PERMISSIVE &&
- smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
- IB_SMI_DISCARD) {
- ret = -EINVAL;
- dev_err(&device->dev, "Invalid directed route\n");
- goto out;
- }
+ if (opa && smp->class_version == OPA_SMP_CLASS_VERSION) {
+ u32 opa_drslid;
+
+ if ((opa_get_smp_direction(opa_smp)
+ ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==
+ OPA_LID_PERMISSIVE &&
+ opa_smi_handle_dr_smp_send(opa_smp,
+ rdma_cap_ib_switch(device),
+ port_num) == IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "OPA Invalid directed route\n");
+ goto out;
+ }
+ opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid);
+ if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) &&
+ opa_drslid & 0xffff0000) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n",
+ opa_drslid);
+ goto out;
+ }
+ drslid = (u16)(opa_drslid & 0x0000ffff);
- /* Check to post send on QP or process locally */
- if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
- smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
- goto out;
+ /* Check to post send on QP or process locally */
+ if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD &&
+ opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD)
+ goto out;
+ } else {
+ if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
+ IB_LID_PERMISSIVE &&
+ smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) ==
+ IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "Invalid directed route\n");
+ goto out;
+ }
+ drslid = be16_to_cpu(smp->dr_slid);
+
+ /* Check to post send on QP or process locally */
+ if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+ smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+ goto out;
+ }
local = kmalloc(sizeof *local, GFP_ATOMIC);
if (!local) {
@@ -771,7 +823,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
}
local->mad_priv = NULL;
local->recv_mad_agent = NULL;
- mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
+ mad_priv = alloc_mad_private(mad_size, GFP_ATOMIC);
if (!mad_priv) {
ret = -ENOMEM;
dev_err(&device->dev, "No memory for local response MAD\n");
@@ -780,18 +832,25 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
}
build_smp_wc(mad_agent_priv->agent.qp,
- send_wr->wr_id, be16_to_cpu(smp->dr_slid),
- send_wr->wr.ud.pkey_index,
- send_wr->wr.ud.port_num, &mad_wc);
+ send_wr->wr.wr_id, drslid,
+ send_wr->pkey_index,
+ send_wr->port_num, &mad_wc);
+
+ if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) {
+ mad_wc.byte_len = mad_send_wr->send_buf.hdr_len
+ + mad_send_wr->send_buf.data_len
+ + sizeof(struct ib_grh);
+ }
/* No GRH for DR SMP */
ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
- (struct ib_mad *)smp,
- (struct ib_mad *)&mad_priv->mad);
+ (const struct ib_mad_hdr *)smp, mad_size,
+ (struct ib_mad_hdr *)mad_priv->mad,
+ &mad_size, &out_mad_pkey_index);
switch (ret)
{
case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
- if (ib_response_mad(&mad_priv->mad.mad) &&
+ if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) &&
mad_agent_priv->agent.recv_handler) {
local->mad_priv = mad_priv;
local->recv_mad_agent = mad_agent_priv;
@@ -801,39 +860,43 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
*/
atomic_inc(&mad_agent_priv->refcount);
} else
- kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(mad_priv);
break;
case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
- kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(mad_priv);
break;
case IB_MAD_RESULT_SUCCESS:
/* Treat like an incoming receive MAD */
port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
mad_agent_priv->agent.port_num);
if (port_priv) {
- memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad));
+ memcpy(mad_priv->mad, smp, mad_priv->mad_size);
recv_mad_agent = find_mad_agent(port_priv,
- &mad_priv->mad.mad);
+ (const struct ib_mad_hdr *)mad_priv->mad);
}
if (!port_priv || !recv_mad_agent) {
/*
* No receiving agent so drop packet and
* generate send completion.
*/
- kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(mad_priv);
break;
}
local->mad_priv = mad_priv;
local->recv_mad_agent = recv_mad_agent;
break;
default:
- kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(mad_priv);
kfree(local);
ret = -EINVAL;
goto out;
}
local->mad_send_wr = mad_send_wr;
+ if (opa) {
+ local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index;
+ local->return_wc_byte_len = mad_size;
+ }
/* Reference MAD agent until send side of local completion handled */
atomic_inc(&mad_agent_priv->refcount);
/* Queue local completion to local list */
@@ -847,11 +910,11 @@ out:
return ret;
}
-static int get_pad_size(int hdr_len, int data_len)
+static int get_pad_size(int hdr_len, int data_len, size_t mad_size)
{
int seg_size, pad;
- seg_size = sizeof(struct ib_mad) - hdr_len;
+ seg_size = mad_size - hdr_len;
if (data_len && seg_size) {
pad = seg_size - data_len % seg_size;
return pad == seg_size ? 0 : pad;
@@ -870,14 +933,15 @@ static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
}
static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
- gfp_t gfp_mask)
+ size_t mad_size, gfp_t gfp_mask)
{
struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
struct ib_rmpp_segment *seg = NULL;
int left, seg_size, pad;
- send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len;
+ send_buf->seg_size = mad_size - send_buf->hdr_len;
+ send_buf->seg_rmpp_size = mad_size - IB_MGMT_RMPP_HDR;
seg_size = send_buf->seg_size;
pad = send_wr->pad;
@@ -910,7 +974,7 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
return 0;
}
-int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent)
+int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent)
{
return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP);
}
@@ -920,26 +984,37 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
u32 remote_qpn, u16 pkey_index,
int rmpp_active,
int hdr_len, int data_len,
- gfp_t gfp_mask)
+ gfp_t gfp_mask,
+ u8 base_version)
{
struct ib_mad_agent_private *mad_agent_priv;
struct ib_mad_send_wr_private *mad_send_wr;
int pad, message_size, ret, size;
void *buf;
+ size_t mad_size;
+ bool opa;
mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
agent);
- pad = get_pad_size(hdr_len, data_len);
+
+ opa = rdma_cap_opa_mad(mad_agent->device, mad_agent->port_num);
+
+ if (opa && base_version == OPA_MGMT_BASE_VERSION)
+ mad_size = sizeof(struct opa_mad);
+ else
+ mad_size = sizeof(struct ib_mad);
+
+ pad = get_pad_size(hdr_len, data_len, mad_size);
message_size = hdr_len + data_len + pad;
if (ib_mad_kernel_rmpp_agent(mad_agent)) {
- if (!rmpp_active && message_size > sizeof(struct ib_mad))
+ if (!rmpp_active && message_size > mad_size)
return ERR_PTR(-EINVAL);
} else
- if (rmpp_active || message_size > sizeof(struct ib_mad))
+ if (rmpp_active || message_size > mad_size)
return ERR_PTR(-EINVAL);
- size = rmpp_active ? hdr_len : sizeof(struct ib_mad);
+ size = rmpp_active ? hdr_len : mad_size;
buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
if (!buf)
return ERR_PTR(-ENOMEM);
@@ -953,21 +1028,28 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
mad_send_wr->mad_agent_priv = mad_agent_priv;
mad_send_wr->sg_list[0].length = hdr_len;
- mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
- mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
- mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
-
- mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
- mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
- mad_send_wr->send_wr.num_sge = 2;
- mad_send_wr->send_wr.opcode = IB_WR_SEND;
- mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
- mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
- mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
- mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
+ mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;
+
+ /* OPA MADs don't have to be the full 2048 bytes */
+ if (opa && base_version == OPA_MGMT_BASE_VERSION &&
+ data_len < mad_size - hdr_len)
+ mad_send_wr->sg_list[1].length = data_len;
+ else
+ mad_send_wr->sg_list[1].length = mad_size - hdr_len;
+
+ mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
+
+ mad_send_wr->send_wr.wr.wr_id = (unsigned long) mad_send_wr;
+ mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list;
+ mad_send_wr->send_wr.wr.num_sge = 2;
+ mad_send_wr->send_wr.wr.opcode = IB_WR_SEND;
+ mad_send_wr->send_wr.wr.send_flags = IB_SEND_SIGNALED;
+ mad_send_wr->send_wr.remote_qpn = remote_qpn;
+ mad_send_wr->send_wr.remote_qkey = IB_QP_SET_QKEY;
+ mad_send_wr->send_wr.pkey_index = pkey_index;
if (rmpp_active) {
- ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask);
+ ret = alloc_send_rmpp_list(mad_send_wr, mad_size, gfp_mask);
if (ret) {
kfree(buf);
return ERR_PTR(ret);
@@ -1069,7 +1151,7 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
/* Set WR ID to find mad_send_wr upon completion */
qp_info = mad_send_wr->mad_agent_priv->qp_info;
- mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
+ mad_send_wr->send_wr.wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
mad_agent = mad_send_wr->send_buf.mad_agent;
@@ -1097,7 +1179,7 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
spin_lock_irqsave(&qp_info->send_queue.lock, flags);
if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
- ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr,
+ ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr,
&bad_send_wr);
list = &qp_info->send_queue.list;
} else {
@@ -1162,7 +1244,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
* request associated with the completion
*/
next_send_buf = send_buf->next;
- mad_send_wr->send_wr.wr.ud.ah = send_buf->ah;
+ mad_send_wr->send_wr.ah = send_buf->ah;
if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class ==
IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
@@ -1237,7 +1319,7 @@ void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
recv_wc);
priv = container_of(mad_priv_hdr, struct ib_mad_private,
header);
- kmem_cache_free(ib_mad_cache, priv);
+ kfree(priv);
}
}
EXPORT_SYMBOL(ib_free_recv_mad);
@@ -1324,7 +1406,7 @@ static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
}
static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
- char *oui)
+ const char *oui)
{
int i;
@@ -1622,13 +1704,13 @@ out:
static struct ib_mad_agent_private *
find_mad_agent(struct ib_mad_port_private *port_priv,
- struct ib_mad *mad)
+ const struct ib_mad_hdr *mad_hdr)
{
struct ib_mad_agent_private *mad_agent = NULL;
unsigned long flags;
spin_lock_irqsave(&port_priv->reg_lock, flags);
- if (ib_response_mad(mad)) {
+ if (ib_response_mad(mad_hdr)) {
u32 hi_tid;
struct ib_mad_agent_private *entry;
@@ -1636,7 +1718,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv,
* Routing is based on high 32 bits of transaction ID
* of MAD.
*/
- hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32;
+ hi_tid = be64_to_cpu(mad_hdr->tid) >> 32;
list_for_each_entry(entry, &port_priv->agent_list, agent_list) {
if (entry->agent.hi_tid == hi_tid) {
mad_agent = entry;
@@ -1648,45 +1730,45 @@ find_mad_agent(struct ib_mad_port_private *port_priv,
struct ib_mad_mgmt_method_table *method;
struct ib_mad_mgmt_vendor_class_table *vendor;
struct ib_mad_mgmt_vendor_class *vendor_class;
- struct ib_vendor_mad *vendor_mad;
+ const struct ib_vendor_mad *vendor_mad;
int index;
/*
* Routing is based on version, class, and method
* For "newer" vendor MADs, also based on OUI
*/
- if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION)
+ if (mad_hdr->class_version >= MAX_MGMT_VERSION)
goto out;
- if (!is_vendor_class(mad->mad_hdr.mgmt_class)) {
+ if (!is_vendor_class(mad_hdr->mgmt_class)) {
class = port_priv->version[
- mad->mad_hdr.class_version].class;
+ mad_hdr->class_version].class;
if (!class)
goto out;
- if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >=
+ if (convert_mgmt_class(mad_hdr->mgmt_class) >=
IB_MGMT_MAX_METHODS)
goto out;
method = class->method_table[convert_mgmt_class(
- mad->mad_hdr.mgmt_class)];
+ mad_hdr->mgmt_class)];
if (method)
- mad_agent = method->agent[mad->mad_hdr.method &
+ mad_agent = method->agent[mad_hdr->method &
~IB_MGMT_METHOD_RESP];
} else {
vendor = port_priv->version[
- mad->mad_hdr.class_version].vendor;
+ mad_hdr->class_version].vendor;
if (!vendor)
goto out;
vendor_class = vendor->vendor_class[vendor_class_index(
- mad->mad_hdr.mgmt_class)];
+ mad_hdr->mgmt_class)];
if (!vendor_class)
goto out;
/* Find matching OUI */
- vendor_mad = (struct ib_vendor_mad *)mad;
+ vendor_mad = (const struct ib_vendor_mad *)mad_hdr;
index = find_vendor_oui(vendor_class, vendor_mad->oui);
if (index == -1)
goto out;
method = vendor_class->method_table[index];
if (method) {
- mad_agent = method->agent[mad->mad_hdr.method &
+ mad_agent = method->agent[mad_hdr->method &
~IB_MGMT_METHOD_RESP];
}
}
@@ -1708,23 +1790,32 @@ out:
return mad_agent;
}
-static int validate_mad(struct ib_mad *mad, u32 qp_num)
+static int validate_mad(const struct ib_mad_hdr *mad_hdr,
+ const struct ib_mad_qp_info *qp_info,
+ bool opa)
{
int valid = 0;
+ u32 qp_num = qp_info->qp->qp_num;
/* Make sure MAD base version is understood */
- if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
- pr_err("MAD received with unsupported base version %d\n",
- mad->mad_hdr.base_version);
+ if (mad_hdr->base_version != IB_MGMT_BASE_VERSION &&
+ (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) {
+ pr_err("MAD received with unsupported base version %d %s\n",
+ mad_hdr->base_version, opa ? "(opa)" : "");
goto out;
}
/* Filter SMI packets sent to other than QP0 */
- if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
- (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
if (qp_num == 0)
valid = 1;
} else {
+ /* CM attributes other than ClassPortInfo only use Send method */
+ if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) &&
+ (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) &&
+ (mad_hdr->method != IB_MGMT_METHOD_SEND))
+ goto out;
/* Filter GSI packets sent to QP0 */
if (qp_num != 0)
valid = 1;
@@ -1734,8 +1825,8 @@ out:
return valid;
}
-static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv,
- struct ib_mad_hdr *mad_hdr)
+static int is_rmpp_data_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_hdr *mad_hdr)
{
struct ib_rmpp_mad *rmpp_mad;
@@ -1747,16 +1838,16 @@ static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv,
(rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
}
-static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr,
- struct ib_mad_recv_wc *rwc)
+static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr,
+ const struct ib_mad_recv_wc *rwc)
{
- return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class ==
+ return ((struct ib_mad_hdr *)(wr->send_buf.mad))->mgmt_class ==
rwc->recv_buf.mad->mad_hdr.mgmt_class;
}
-static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
- struct ib_mad_send_wr_private *wr,
- struct ib_mad_recv_wc *rwc )
+static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_send_wr_private *wr,
+ const struct ib_mad_recv_wc *rwc )
{
struct ib_ah_attr attr;
u8 send_resp, rcv_resp;
@@ -1765,8 +1856,8 @@ static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
u8 port_num = mad_agent_priv->agent.port_num;
u8 lmc;
- send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad);
- rcv_resp = ib_response_mad(rwc->recv_buf.mad);
+ send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad);
+ rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr);
if (send_resp == rcv_resp)
/* both requests, or both responses. GIDs different */
@@ -1791,7 +1882,7 @@ static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
((1 << lmc) - 1)));
} else {
if (ib_get_cached_gid(device, port_num,
- attr.grh.sgid_index, &sgid))
+ attr.grh.sgid_index, &sgid, NULL))
return 0;
return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
16);
@@ -1811,22 +1902,22 @@ static inline int is_direct(u8 class)
}
struct ib_mad_send_wr_private*
-ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
- struct ib_mad_recv_wc *wc)
+ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_recv_wc *wc)
{
struct ib_mad_send_wr_private *wr;
- struct ib_mad *mad;
+ const struct ib_mad_hdr *mad_hdr;
- mad = (struct ib_mad *)wc->recv_buf.mad;
+ mad_hdr = &wc->recv_buf.mad->mad_hdr;
list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
- if ((wr->tid == mad->mad_hdr.tid) &&
+ if ((wr->tid == mad_hdr->tid) &&
rcv_has_same_class(wr, wc) &&
/*
* Don't check GID for direct routed MADs.
* These might have permissive LIDs.
*/
- (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+ (is_direct(mad_hdr->mgmt_class) ||
rcv_has_same_gid(mad_agent_priv, wr, wc)))
return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
}
@@ -1836,15 +1927,15 @@ ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
* been notified that the send has completed
*/
list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
- if (is_data_mad(mad_agent_priv, wr->send_buf.mad) &&
- wr->tid == mad->mad_hdr.tid &&
+ if (is_rmpp_data_mad(mad_agent_priv, wr->send_buf.mad) &&
+ wr->tid == mad_hdr->tid &&
wr->timeout &&
rcv_has_same_class(wr, wc) &&
/*
* Don't check GID for direct routed MADs.
* These might have permissive LIDs.
*/
- (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+ (is_direct(mad_hdr->mgmt_class) ||
rcv_has_same_gid(mad_agent_priv, wr, wc)))
/* Verify request has not been canceled */
return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
@@ -1879,7 +1970,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
}
/* Complete corresponding request */
- if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+ if (ib_response_mad(&mad_recv_wc->recv_buf.mad->mad_hdr)) {
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
if (!mad_send_wr) {
@@ -1924,26 +2015,163 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
}
}
-static bool generate_unmatched_resp(struct ib_mad_private *recv,
- struct ib_mad_private *response)
+static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv,
+ const struct ib_mad_qp_info *qp_info,
+ const struct ib_wc *wc,
+ int port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response)
{
- if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET ||
- recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) {
- memcpy(response, recv, sizeof *response);
+ enum smi_forward_action retsmi;
+ struct ib_smp *smp = (struct ib_smp *)recv->mad;
+
+ if (smi_handle_dr_smp_recv(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ retsmi = smi_check_forward_dr_smp(smp);
+ if (retsmi == IB_SMI_LOCAL)
+ return IB_SMI_HANDLE;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (smi_handle_dr_smp_send(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+ } else if (rdma_cap_ib_switch(port_priv->device)) {
+ /* forward case for switches */
+ memcpy(response, recv, mad_priv_size(response));
response->header.recv_wc.wc = &response->header.wc;
- response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+ response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad;
response->header.recv_wc.recv_buf.grh = &response->grh;
- response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
- response->mad.mad.mad_hdr.status =
- cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB);
- if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
- response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION;
+
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
+ &response->grh, wc,
+ port_priv->device,
+ smi_get_fwd_port(smp),
+ qp_info->qp->qp_num,
+ response->mad_size,
+ false);
+
+ return IB_SMI_DISCARD;
+ }
+ return IB_SMI_HANDLE;
+}
+
+static bool generate_unmatched_resp(const struct ib_mad_private *recv,
+ struct ib_mad_private *response,
+ size_t *resp_len, bool opa)
+{
+ const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad;
+ struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad;
+
+ if (recv_hdr->method == IB_MGMT_METHOD_GET ||
+ recv_hdr->method == IB_MGMT_METHOD_SET) {
+ memcpy(response, recv, mad_priv_size(response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+ resp_hdr->method = IB_MGMT_METHOD_GET_RESP;
+ resp_hdr->status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB);
+ if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ resp_hdr->status |= IB_SMP_DIRECTION;
+
+ if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) {
+ if (recv_hdr->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ recv_hdr->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ *resp_len = opa_get_smp_header_size(
+ (struct opa_smp *)recv->mad);
+ else
+ *resp_len = sizeof(struct ib_mad_hdr);
+ }
return true;
} else {
return false;
}
}
+
+static enum smi_action
+handle_opa_smi(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info,
+ struct ib_wc *wc,
+ int port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response)
+{
+ enum smi_forward_action retsmi;
+ struct opa_smp *smp = (struct opa_smp *)recv->mad;
+
+ if (opa_smi_handle_dr_smp_recv(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ retsmi = opa_smi_check_forward_dr_smp(smp);
+ if (retsmi == IB_SMI_LOCAL)
+ return IB_SMI_HANDLE;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (opa_smi_handle_dr_smp_send(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ if (opa_smi_check_local_smp(smp, port_priv->device) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ } else if (rdma_cap_ib_switch(port_priv->device)) {
+ /* forward case for switches */
+ memcpy(response, recv, mad_priv_size(response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.opa_mad =
+ (struct opa_mad *)response->mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
+ &response->grh, wc,
+ port_priv->device,
+ opa_smi_get_fwd_port(smp),
+ qp_info->qp->qp_num,
+ recv->header.wc.byte_len,
+ true);
+
+ return IB_SMI_DISCARD;
+ }
+
+ return IB_SMI_HANDLE;
+}
+
+static enum smi_action
+handle_smi(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info,
+ struct ib_wc *wc,
+ int port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response,
+ bool opa)
+{
+ struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad;
+
+ if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION &&
+ mad_hdr->class_version == OPA_SMI_CLASS_VERSION)
+ return handle_opa_smi(port_priv, qp_info, wc, port_num, recv,
+ response);
+
+ return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response);
+}
+
static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
struct ib_wc *wc)
{
@@ -1954,109 +2182,97 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
struct ib_mad_agent_private *mad_agent;
int port_num;
int ret = IB_MAD_RESULT_SUCCESS;
+ size_t mad_size;
+ u16 resp_mad_pkey_index = 0;
+ bool opa;
mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
qp_info = mad_list->mad_queue->qp_info;
dequeue_mad(mad_list);
+ opa = rdma_cap_opa_mad(qp_info->port_priv->device,
+ qp_info->port_priv->port_num);
+
mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
mad_list);
recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
ib_dma_unmap_single(port_priv->device,
recv->header.mapping,
- sizeof(struct ib_mad_private) -
- sizeof(struct ib_mad_private_header),
+ mad_priv_dma_size(recv),
DMA_FROM_DEVICE);
/* Setup MAD receive work completion from "normal" work completion */
recv->header.wc = *wc;
recv->header.recv_wc.wc = &recv->header.wc;
- recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
- recv->header.recv_wc.recv_buf.mad = &recv->mad.mad;
+
+ if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) {
+ recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh);
+ recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
+ } else {
+ recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
+ }
+
+ recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad;
recv->header.recv_wc.recv_buf.grh = &recv->grh;
if (atomic_read(&qp_info->snoop_count))
snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
/* Validate MAD */
- if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num))
+ if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa))
goto out;
- response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+ mad_size = recv->mad_size;
+ response = alloc_mad_private(mad_size, GFP_KERNEL);
if (!response) {
dev_err(&port_priv->device->dev,
"ib_mad_recv_done_handler no memory for response buffer\n");
goto out;
}
- if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)
+ if (rdma_cap_ib_switch(port_priv->device))
port_num = wc->port_num;
else
port_num = port_priv->port_num;
- if (recv->mad.mad.mad_hdr.mgmt_class ==
+ if (((struct ib_mad_hdr *)recv->mad)->mgmt_class ==
IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
- enum smi_forward_action retsmi;
-
- if (smi_handle_dr_smp_recv(&recv->mad.smp,
- port_priv->device->node_type,
- port_num,
- port_priv->device->phys_port_cnt) ==
- IB_SMI_DISCARD)
+ if (handle_smi(port_priv, qp_info, wc, port_num, recv,
+ response, opa)
+ == IB_SMI_DISCARD)
goto out;
-
- retsmi = smi_check_forward_dr_smp(&recv->mad.smp);
- if (retsmi == IB_SMI_LOCAL)
- goto local;
-
- if (retsmi == IB_SMI_SEND) { /* don't forward */
- if (smi_handle_dr_smp_send(&recv->mad.smp,
- port_priv->device->node_type,
- port_num) == IB_SMI_DISCARD)
- goto out;
-
- if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD)
- goto out;
- } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) {
- /* forward case for switches */
- memcpy(response, recv, sizeof(*response));
- response->header.recv_wc.wc = &response->header.wc;
- response->header.recv_wc.recv_buf.mad = &response->mad.mad;
- response->header.recv_wc.recv_buf.grh = &response->grh;
-
- agent_send_response(&response->mad.mad,
- &response->grh, wc,
- port_priv->device,
- smi_get_fwd_port(&recv->mad.smp),
- qp_info->qp->qp_num);
-
- goto out;
- }
}
-local:
/* Give driver "right of first refusal" on incoming MAD */
if (port_priv->device->process_mad) {
ret = port_priv->device->process_mad(port_priv->device, 0,
port_priv->port_num,
wc, &recv->grh,
- &recv->mad.mad,
- &response->mad.mad);
+ (const struct ib_mad_hdr *)recv->mad,
+ recv->mad_size,
+ (struct ib_mad_hdr *)response->mad,
+ &mad_size, &resp_mad_pkey_index);
+
+ if (opa)
+ wc->pkey_index = resp_mad_pkey_index;
+
if (ret & IB_MAD_RESULT_SUCCESS) {
if (ret & IB_MAD_RESULT_CONSUMED)
goto out;
if (ret & IB_MAD_RESULT_REPLY) {
- agent_send_response(&response->mad.mad,
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
&recv->grh, wc,
port_priv->device,
port_num,
- qp_info->qp->qp_num);
+ qp_info->qp->qp_num,
+ mad_size, opa);
goto out;
}
}
}
- mad_agent = find_mad_agent(port_priv, &recv->mad.mad);
+ mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad);
if (mad_agent) {
ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
/*
@@ -2065,17 +2281,17 @@ local:
*/
recv = NULL;
} else if ((ret & IB_MAD_RESULT_SUCCESS) &&
- generate_unmatched_resp(recv, response)) {
- agent_send_response(&response->mad.mad, &recv->grh, wc,
- port_priv->device, port_num, qp_info->qp->qp_num);
+ generate_unmatched_resp(recv, response, &mad_size, opa)) {
+ agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc,
+ port_priv->device, port_num,
+ qp_info->qp->qp_num, mad_size, opa);
}
out:
/* Post another receive request for this QP */
if (response) {
ib_mad_post_receive_mads(qp_info, response);
- if (recv)
- kmem_cache_free(ib_mad_cache, recv);
+ kfree(recv);
} else
ib_mad_post_receive_mads(qp_info, recv);
}
@@ -2246,7 +2462,7 @@ retry:
ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
if (queued_send_wr) {
- ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+ ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr,
&bad_send_wr);
if (ret) {
dev_err(&port_priv->device->dev,
@@ -2304,7 +2520,7 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv,
struct ib_send_wr *bad_send_wr;
mad_send_wr->retry = 0;
- ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+ ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr,
&bad_send_wr);
if (ret)
ib_mad_send_done_handler(port_priv, wc);
@@ -2411,7 +2627,8 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
agent_list) {
- if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) &&
+ if (is_rmpp_data_mad(mad_agent_priv,
+ mad_send_wr->send_buf.mad) &&
&mad_send_wr->send_buf == send_buf)
return mad_send_wr;
}
@@ -2468,10 +2685,14 @@ static void local_completions(struct work_struct *work)
int free_mad;
struct ib_wc wc;
struct ib_mad_send_wc mad_send_wc;
+ bool opa;
mad_agent_priv =
container_of(work, struct ib_mad_agent_private, local_work);
+ opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
+ mad_agent_priv->qp_info->port_priv->port_num);
+
spin_lock_irqsave(&mad_agent_priv->lock, flags);
while (!list_empty(&mad_agent_priv->local_list)) {
local = list_entry(mad_agent_priv->local_list.next,
@@ -2481,6 +2702,7 @@ static void local_completions(struct work_struct *work)
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
free_mad = 0;
if (local->mad_priv) {
+ u8 base_version;
recv_mad_agent = local->recv_mad_agent;
if (!recv_mad_agent) {
dev_err(&mad_agent_priv->agent.device->dev,
@@ -2496,17 +2718,26 @@ static void local_completions(struct work_struct *work)
build_smp_wc(recv_mad_agent->agent.qp,
(unsigned long) local->mad_send_wr,
be16_to_cpu(IB_LID_PERMISSIVE),
- 0, recv_mad_agent->agent.port_num, &wc);
+ local->mad_send_wr->send_wr.pkey_index,
+ recv_mad_agent->agent.port_num, &wc);
local->mad_priv->header.recv_wc.wc = &wc;
- local->mad_priv->header.recv_wc.mad_len =
- sizeof(struct ib_mad);
+
+ base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version;
+ if (opa && base_version == OPA_MGMT_BASE_VERSION) {
+ local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len;
+ local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
+ } else {
+ local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
+ }
+
INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
&local->mad_priv->header.recv_wc.rmpp_list);
local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
local->mad_priv->header.recv_wc.recv_buf.mad =
- &local->mad_priv->mad.mad;
+ (struct ib_mad *)local->mad_priv->mad;
if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
snoop_recv(recv_mad_agent->qp_info,
&local->mad_priv->header.recv_wc,
@@ -2534,7 +2765,7 @@ local_send_completion:
spin_lock_irqsave(&mad_agent_priv->lock, flags);
atomic_dec(&mad_agent_priv->refcount);
if (free_mad)
- kmem_cache_free(ib_mad_cache, local->mad_priv);
+ kfree(local->mad_priv);
kfree(local);
}
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
@@ -2649,8 +2880,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
/* Initialize common scatter list fields */
- sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
- sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+ sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
/* Initialize common receive WR fields */
recv_wr.next = NULL;
@@ -2663,7 +2893,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
mad_priv = mad;
mad = NULL;
} else {
- mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+ mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv),
+ GFP_ATOMIC);
if (!mad_priv) {
dev_err(&qp_info->port_priv->device->dev,
"No memory for receive buffer\n");
@@ -2671,10 +2902,10 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
break;
}
}
+ sg_list.length = mad_priv_dma_size(mad_priv);
sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
&mad_priv->grh,
- sizeof *mad_priv -
- sizeof mad_priv->header,
+ mad_priv_dma_size(mad_priv),
DMA_FROM_DEVICE);
if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
sg_list.addr))) {
@@ -2698,10 +2929,9 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
spin_unlock_irqrestore(&recv_queue->lock, flags);
ib_dma_unmap_single(qp_info->port_priv->device,
mad_priv->header.mapping,
- sizeof *mad_priv -
- sizeof mad_priv->header,
+ mad_priv_dma_size(mad_priv),
DMA_FROM_DEVICE);
- kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(mad_priv);
dev_err(&qp_info->port_priv->device->dev,
"ib_post_recv failed: %d\n", ret);
break;
@@ -2738,10 +2968,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
ib_dma_unmap_single(qp_info->port_priv->device,
recv->header.mapping,
- sizeof(struct ib_mad_private) -
- sizeof(struct ib_mad_private_header),
+ mad_priv_dma_size(recv),
DMA_FROM_DEVICE);
- kmem_cache_free(ib_mad_cache, recv);
+ kfree(recv);
}
qp_info->recv_queue.count = 0;
@@ -2922,6 +3151,14 @@ static int ib_mad_port_open(struct ib_device *device,
unsigned long flags;
char name[sizeof "ib_mad123"];
int has_smi;
+ struct ib_cq_init_attr cq_attr = {};
+
+ if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE))
+ return -EFAULT;
+
+ if (WARN_ON(rdma_cap_opa_mad(device, port_num) &&
+ rdma_max_mad_size(device, port_num) < OPA_MGMT_MAD_SIZE))
+ return -EFAULT;
/* Create new device info */
port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
@@ -2938,13 +3175,14 @@ static int ib_mad_port_open(struct ib_device *device,
init_mad_qp(port_priv, &port_priv->qp_info[1]);
cq_size = mad_sendq_size + mad_recvq_size;
- has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND;
+ has_smi = rdma_cap_ib_smi(device, port_num);
if (has_smi)
cq_size *= 2;
+ cq_attr.cqe = cq_size;
port_priv->cq = ib_create_cq(port_priv->device,
ib_mad_thread_completion_handler,
- NULL, port_priv, cq_size, 0);
+ NULL, port_priv, &cq_attr);
if (IS_ERR(port_priv->cq)) {
dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
ret = PTR_ERR(port_priv->cq);
@@ -2958,13 +3196,6 @@ static int ib_mad_port_open(struct ib_device *device,
goto error4;
}
- port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(port_priv->mr)) {
- dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
- ret = PTR_ERR(port_priv->mr);
- goto error5;
- }
-
if (has_smi) {
ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
if (ret)
@@ -3005,8 +3236,6 @@ error8:
error7:
destroy_mad_qp(&port_priv->qp_info[0]);
error6:
- ib_dereg_mr(port_priv->mr);
-error5:
ib_dealloc_pd(port_priv->pd);
error4:
ib_destroy_cq(port_priv->cq);
@@ -3041,7 +3270,6 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
destroy_workqueue(port_priv->wq);
destroy_mad_qp(&port_priv->qp_info[1]);
destroy_mad_qp(&port_priv->qp_info[0]);
- ib_dereg_mr(port_priv->mr);
ib_dealloc_pd(port_priv->pd);
ib_destroy_cq(port_priv->cq);
cleanup_recv_queue(&port_priv->qp_info[1]);
@@ -3055,20 +3283,14 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
static void ib_mad_init_device(struct ib_device *device)
{
- int start, end, i;
+ int start, i;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
+ start = rdma_start_port(device);
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- start = 0;
- end = 0;
- } else {
- start = 1;
- end = device->phys_port_cnt;
- }
+ for (i = start; i <= rdma_end_port(device); i++) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
- for (i = start; i <= end; i++) {
if (ib_mad_port_open(device, i)) {
dev_err(&device->dev, "Couldn't open port %d\n", i);
goto error;
@@ -3086,40 +3308,31 @@ error_agent:
dev_err(&device->dev, "Couldn't close port %d\n", i);
error:
- i--;
+ while (--i >= start) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
- while (i >= start) {
if (ib_agent_port_close(device, i))
dev_err(&device->dev,
"Couldn't close port %d for agents\n", i);
if (ib_mad_port_close(device, i))
dev_err(&device->dev, "Couldn't close port %d\n", i);
- i--;
}
}
-static void ib_mad_remove_device(struct ib_device *device)
+static void ib_mad_remove_device(struct ib_device *device, void *client_data)
{
- int i, num_ports, cur_port;
+ int i;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
+ for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- num_ports = 1;
- cur_port = 0;
- } else {
- num_ports = device->phys_port_cnt;
- cur_port = 1;
- }
- for (i = 0; i < num_ports; i++, cur_port++) {
- if (ib_agent_port_close(device, cur_port))
+ if (ib_agent_port_close(device, i))
dev_err(&device->dev,
- "Couldn't close port %d for agents\n",
- cur_port);
- if (ib_mad_port_close(device, cur_port))
- dev_err(&device->dev, "Couldn't close port %d\n",
- cur_port);
+ "Couldn't close port %d for agents\n", i);
+ if (ib_mad_port_close(device, i))
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
}
}
@@ -3131,45 +3344,25 @@ static struct ib_client mad_client = {
static int __init ib_mad_init_module(void)
{
- int ret;
-
mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
- ib_mad_cache = kmem_cache_create("ib_mad",
- sizeof(struct ib_mad_private),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!ib_mad_cache) {
- pr_err("Couldn't create ib_mad cache\n");
- ret = -ENOMEM;
- goto error1;
- }
-
INIT_LIST_HEAD(&ib_mad_port_list);
if (ib_register_client(&mad_client)) {
pr_err("Couldn't register ib_mad client\n");
- ret = -EINVAL;
- goto error2;
+ return -EINVAL;
}
return 0;
-
-error2:
- kmem_cache_destroy(ib_mad_cache);
-error1:
- return ret;
}
static void __exit ib_mad_cleanup_module(void)
{
ib_unregister_client(&mad_client);
- kmem_cache_destroy(ib_mad_cache);
}
module_init(ib_mad_init_module);
diff --git a/kernel/drivers/infiniband/core/mad_priv.h b/kernel/drivers/infiniband/core/mad_priv.h
index d1a0b0ee9..990698a6a 100644
--- a/kernel/drivers/infiniband/core/mad_priv.h
+++ b/kernel/drivers/infiniband/core/mad_priv.h
@@ -41,6 +41,7 @@
#include <linux/workqueue.h>
#include <rdma/ib_mad.h>
#include <rdma/ib_smi.h>
+#include <rdma/opa_smi.h>
#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */
@@ -56,7 +57,7 @@
/* Registration table sizes */
#define MAX_MGMT_CLASS 80
-#define MAX_MGMT_VERSION 8
+#define MAX_MGMT_VERSION 0x83
#define MAX_MGMT_OUI 8
#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \
IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
@@ -75,12 +76,9 @@ struct ib_mad_private_header {
struct ib_mad_private {
struct ib_mad_private_header header;
+ size_t mad_size;
struct ib_grh grh;
- union {
- struct ib_mad mad;
- struct ib_rmpp_mad rmpp_mad;
- struct ib_smp smp;
- } mad;
+ u8 mad[0];
} __attribute__ ((packed));
struct ib_rmpp_segment {
@@ -125,7 +123,7 @@ struct ib_mad_send_wr_private {
struct ib_mad_send_buf send_buf;
u64 header_mapping;
u64 payload_mapping;
- struct ib_send_wr send_wr;
+ struct ib_ud_wr send_wr;
struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
__be64 tid;
unsigned long timeout;
@@ -150,6 +148,7 @@ struct ib_mad_local_private {
struct ib_mad_private *mad_priv;
struct ib_mad_agent_private *recv_mad_agent;
struct ib_mad_send_wr_private *mad_send_wr;
+ size_t return_wc_byte_len;
};
struct ib_mad_mgmt_method_table {
@@ -200,7 +199,6 @@ struct ib_mad_port_private {
int port_num;
struct ib_cq *cq;
struct ib_pd *pd;
- struct ib_mr *mr;
spinlock_t reg_lock;
struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
@@ -213,8 +211,8 @@ struct ib_mad_port_private {
int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
struct ib_mad_send_wr_private *
-ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
- struct ib_mad_recv_wc *mad_recv_wc);
+ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_recv_wc *mad_recv_wc);
void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
struct ib_mad_send_wc *mad_send_wc);
diff --git a/kernel/drivers/infiniband/core/mad_rmpp.c b/kernel/drivers/infiniband/core/mad_rmpp.c
index f37878c9c..382941b46 100644
--- a/kernel/drivers/infiniband/core/mad_rmpp.c
+++ b/kernel/drivers/infiniband/core/mad_rmpp.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2005 Intel Inc. All rights reserved.
* Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -67,6 +68,7 @@ struct mad_rmpp_recv {
u8 mgmt_class;
u8 class_version;
u8 method;
+ u8 base_version;
};
static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
@@ -139,7 +141,8 @@ static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp,
recv_wc->wc->pkey_index, 1, hdr_len,
- 0, GFP_KERNEL);
+ 0, GFP_KERNEL,
+ IB_MGMT_BASE_VERSION);
if (IS_ERR(msg))
return;
@@ -165,7 +168,8 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
recv_wc->wc->pkey_index, 1,
- hdr_len, 0, GFP_KERNEL);
+ hdr_len, 0, GFP_KERNEL,
+ IB_MGMT_BASE_VERSION);
if (IS_ERR(msg))
ib_destroy_ah(ah);
else {
@@ -316,6 +320,7 @@ create_rmpp_recv(struct ib_mad_agent_private *agent,
rmpp_recv->mgmt_class = mad_hdr->mgmt_class;
rmpp_recv->class_version = mad_hdr->class_version;
rmpp_recv->method = mad_hdr->method;
+ rmpp_recv->base_version = mad_hdr->base_version;
return rmpp_recv;
error: kfree(rmpp_recv);
@@ -431,14 +436,23 @@ static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
{
struct ib_rmpp_mad *rmpp_mad;
int hdr_size, data_size, pad;
+ bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device,
+ rmpp_recv->agent->qp_info->port_priv->port_num);
rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
- data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
- pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
- if (pad > IB_MGMT_RMPP_DATA || pad < 0)
- pad = 0;
+ if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) {
+ data_size = sizeof(struct opa_rmpp_mad) - hdr_size;
+ pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (pad > OPA_MGMT_RMPP_DATA || pad < 0)
+ pad = 0;
+ } else {
+ data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+ pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (pad > IB_MGMT_RMPP_DATA || pad < 0)
+ pad = 0;
+ }
return hdr_size + rmpp_recv->seg_num * data_size - pad;
}
@@ -570,13 +584,14 @@ static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
if (mad_send_wr->seg_num == 1) {
rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST;
- paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA -
- mad_send_wr->pad;
+ paylen = (mad_send_wr->send_buf.seg_count *
+ mad_send_wr->send_buf.seg_rmpp_size) -
+ mad_send_wr->pad;
}
if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) {
rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST;
- paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad;
+ paylen = mad_send_wr->send_buf.seg_rmpp_size - mad_send_wr->pad;
}
rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
diff --git a/kernel/drivers/infiniband/core/multicast.c b/kernel/drivers/infiniband/core/multicast.c
index fa17b552f..bb6685fb0 100644
--- a/kernel/drivers/infiniband/core/multicast.c
+++ b/kernel/drivers/infiniband/core/multicast.c
@@ -43,7 +43,7 @@
#include "sa.h"
static void mcast_add_one(struct ib_device *device);
-static void mcast_remove_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device, void *client_data);
static struct ib_client mcast_client = {
.name = "ib_multicast",
@@ -729,7 +729,8 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
u16 gid_index;
u8 p;
- ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index);
+ ret = ib_find_cached_gid(device, &rec->port_gid,
+ NULL, &p, &gid_index);
if (ret)
return ret;
@@ -780,8 +781,7 @@ static void mcast_event_handler(struct ib_event_handler *handler,
int index;
dev = container_of(handler, struct mcast_device, event_handler);
- if (rdma_port_get_link_layer(dev->device, event->element.port_num) !=
- IB_LINK_LAYER_INFINIBAND)
+ if (!rdma_cap_ib_mcast(dev->device, event->element.port_num))
return;
index = event->element.port_num - dev->start_port;
@@ -808,24 +808,16 @@ static void mcast_add_one(struct ib_device *device)
int i;
int count = 0;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
-
dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
GFP_KERNEL);
if (!dev)
return;
- if (device->node_type == RDMA_NODE_IB_SWITCH)
- dev->start_port = dev->end_port = 0;
- else {
- dev->start_port = 1;
- dev->end_port = device->phys_port_cnt;
- }
+ dev->start_port = rdma_start_port(device);
+ dev->end_port = rdma_end_port(device);
for (i = 0; i <= dev->end_port - dev->start_port; i++) {
- if (rdma_port_get_link_layer(device, dev->start_port + i) !=
- IB_LINK_LAYER_INFINIBAND)
+ if (!rdma_cap_ib_mcast(device, dev->start_port + i))
continue;
port = &dev->port[i];
port->dev = dev;
@@ -849,13 +841,12 @@ static void mcast_add_one(struct ib_device *device)
ib_register_event_handler(&dev->event_handler);
}
-static void mcast_remove_one(struct ib_device *device)
+static void mcast_remove_one(struct ib_device *device, void *client_data)
{
- struct mcast_device *dev;
+ struct mcast_device *dev = client_data;
struct mcast_port *port;
int i;
- dev = ib_get_client_data(device, &mcast_client);
if (!dev)
return;
@@ -863,8 +854,7 @@ static void mcast_remove_one(struct ib_device *device)
flush_workqueue(mcast_wq);
for (i = 0; i <= dev->end_port - dev->start_port; i++) {
- if (rdma_port_get_link_layer(device, dev->start_port + i) ==
- IB_LINK_LAYER_INFINIBAND) {
+ if (rdma_cap_ib_mcast(device, dev->start_port + i)) {
port = &dev->port[i];
deref_port(port);
wait_for_completion(&port->comp);
diff --git a/kernel/drivers/infiniband/core/netlink.c b/kernel/drivers/infiniband/core/netlink.c
index 23dd5a5c7..d47df9356 100644
--- a/kernel/drivers/infiniband/core/netlink.c
+++ b/kernel/drivers/infiniband/core/netlink.c
@@ -49,6 +49,14 @@ static DEFINE_MUTEX(ibnl_mutex);
static struct sock *nls;
static LIST_HEAD(client_list);
+int ibnl_chk_listeners(unsigned int group)
+{
+ if (netlink_has_listeners(nls, group) == 0)
+ return -1;
+ return 0;
+}
+EXPORT_SYMBOL(ibnl_chk_listeners);
+
int ibnl_add_client(int index, int nops,
const struct ibnl_client_cbs cb_table[])
{
@@ -151,6 +159,23 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
!client->cb_table[op].dump)
return -EINVAL;
+ /*
+ * For response or local service set_timeout request,
+ * there is no need to use netlink_dump_start.
+ */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ (index == RDMA_NL_LS &&
+ op == RDMA_NL_LS_OP_SET_TIMEOUT)) {
+ struct netlink_callback cb = {
+ .skb = skb,
+ .nlh = nlh,
+ .dump = client->cb_table[op].dump,
+ .module = client->cb_table[op].module,
+ };
+
+ return cb.dump(skb, &cb);
+ }
+
{
struct netlink_dump_control c = {
.dump = client->cb_table[op].dump,
@@ -165,9 +190,39 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
}
+static void ibnl_rcv_reply_skb(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh;
+ int msglen;
+
+ /*
+ * Process responses until there is no more message or the first
+ * request. Generally speaking, it is not recommended to mix responses
+ * with requests.
+ */
+ while (skb->len >= nlmsg_total_size(0)) {
+ nlh = nlmsg_hdr(skb);
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+ return;
+
+ /* Handle response only */
+ if (nlh->nlmsg_flags & NLM_F_REQUEST)
+ return;
+
+ ibnl_rcv_msg(skb, nlh);
+
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+ skb_pull(skb, msglen);
+ }
+}
+
static void ibnl_rcv(struct sk_buff *skb)
{
mutex_lock(&ibnl_mutex);
+ ibnl_rcv_reply_skb(skb);
netlink_rcv_skb(skb, &ibnl_rcv_msg);
mutex_unlock(&ibnl_mutex);
}
diff --git a/kernel/drivers/infiniband/core/opa_smi.h b/kernel/drivers/infiniband/core/opa_smi.h
new file mode 100644
index 000000000..3bfab3505
--- /dev/null
+++ b/kernel/drivers/infiniband/core/opa_smi.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __OPA_SMI_H_
+#define __OPA_SMI_H_
+
+#include <rdma/ib_smi.h>
+#include <rdma/opa_smi.h>
+
+#include "smi.h"
+
+enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,
+ int port_num, int phys_port_cnt);
+int opa_smi_get_fwd_port(struct opa_smp *smp);
+extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp);
+extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
+ bool is_switch, int port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action opa_smi_check_local_smp(struct opa_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+ return (device->process_mad &&
+ !opa_get_smp_direction(smp) &&
+ (smp->hop_ptr == smp->hop_cnt + 1)) ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD;
+}
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action opa_smi_check_local_returning_smp(struct opa_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ /* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+ return (device->process_mad &&
+ opa_get_smp_direction(smp) &&
+ !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD;
+}
+
+#endif /* __OPA_SMI_H_ */
diff --git a/kernel/drivers/infiniband/core/roce_gid_mgmt.c b/kernel/drivers/infiniband/core/roce_gid_mgmt.c
new file mode 100644
index 000000000..178f98482
--- /dev/null
+++ b/kernel/drivers/infiniband/core/roce_gid_mgmt.c
@@ -0,0 +1,747 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/in.h>
+#include <linux/in6.h>
+
+/* For in6_dev_get/in6_dev_put */
+#include <net/addrconf.h>
+#include <net/bonding.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+enum gid_op_type {
+ GID_DEL = 0,
+ GID_ADD
+};
+
+struct update_gid_event_work {
+ struct work_struct work;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+ enum gid_op_type gid_op;
+};
+
+#define ROCE_NETDEV_CALLBACK_SZ 3
+struct netdev_event_work_cmd {
+ roce_netdev_callback cb;
+ roce_netdev_filter filter;
+ struct net_device *ndev;
+ struct net_device *filter_ndev;
+};
+
+struct netdev_event_work {
+ struct work_struct work;
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ];
+};
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+ u8 port, union ib_gid *gid,
+ struct ib_gid_attr *gid_attr)
+{
+ switch (gid_op) {
+ case GID_ADD:
+ ib_cache_gid_add(ib_dev, port, gid, gid_attr);
+ break;
+ case GID_DEL:
+ ib_cache_gid_del(ib_dev, port, gid, gid_attr);
+ break;
+ }
+}
+
+enum bonding_slave_state {
+ BONDING_SLAVE_STATE_ACTIVE = 1UL << 0,
+ BONDING_SLAVE_STATE_INACTIVE = 1UL << 1,
+ /* No primary slave or the device isn't a slave in bonding */
+ BONDING_SLAVE_STATE_NA = 1UL << 2,
+};
+
+static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev,
+ struct net_device *upper)
+{
+ if (upper && netif_is_bond_master(upper)) {
+ struct net_device *pdev =
+ bond_option_active_slave_get_rcu(netdev_priv(upper));
+
+ if (pdev)
+ return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE :
+ BONDING_SLAVE_STATE_INACTIVE;
+ }
+
+ return BONDING_SLAVE_STATE_NA;
+}
+
+static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper)
+{
+ struct net_device *_upper = NULL;
+ struct list_head *iter;
+
+ netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
+ if (_upper == upper)
+ break;
+
+ return _upper == upper;
+}
+
+#define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \
+ BONDING_SLAVE_STATE_NA)
+static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+ struct net_device *real_dev;
+ int res;
+
+ if (!rdma_ndev)
+ return 0;
+
+ rcu_read_lock();
+ real_dev = rdma_vlan_dev_real_dev(event_ndev);
+ if (!real_dev)
+ real_dev = event_ndev;
+
+ res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+ (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
+ REQUIRED_BOND_STATES)) ||
+ real_dev == rdma_ndev);
+
+ rcu_read_unlock();
+ return res;
+}
+
+static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *master_dev;
+ int res;
+
+ if (!rdma_ndev)
+ return 0;
+
+ rcu_read_lock();
+ master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+ res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) ==
+ BONDING_SLAVE_STATE_INACTIVE;
+ rcu_read_unlock();
+
+ return res;
+}
+
+static int pass_all_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ return 1;
+}
+
+static int upper_device_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+ int res;
+
+ if (!rdma_ndev)
+ return 0;
+
+ if (rdma_ndev == event_ndev)
+ return 1;
+
+ rcu_read_lock();
+ res = is_upper_dev_rcu(rdma_ndev, event_ndev);
+ rcu_read_unlock();
+
+ return res;
+}
+
+static void update_gid_ip(enum gid_op_type gid_op,
+ struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev,
+ struct sockaddr *addr)
+{
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+
+ rdma_ip2gid(addr, &gid);
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
+}
+
+static void enum_netdev_default_gids(struct ib_device *ib_dev,
+ u8 port, struct net_device *event_ndev,
+ struct net_device *rdma_ndev)
+{
+ rcu_read_lock();
+ if (!rdma_ndev ||
+ ((rdma_ndev != event_ndev &&
+ !is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
+ is_eth_active_slave_of_bonding_rcu(rdma_ndev,
+ netdev_master_upper_dev_get_rcu(rdma_ndev)) ==
+ BONDING_SLAVE_STATE_INACTIVE)) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+ IB_CACHE_GID_DEFAULT_MODE_SET);
+}
+
+static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *event_ndev,
+ struct net_device *rdma_ndev)
+{
+ struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev);
+
+ if (!rdma_ndev)
+ return;
+
+ if (!real_dev)
+ real_dev = event_ndev;
+
+ rcu_read_lock();
+
+ if (is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+ is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
+ BONDING_SLAVE_STATE_INACTIVE) {
+ rcu_read_unlock();
+
+ ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE);
+ } else {
+ rcu_read_unlock();
+ }
+}
+
+static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev)
+{
+ struct in_device *in_dev;
+ struct sin_list {
+ struct list_head list;
+ struct sockaddr_in ip;
+ };
+ struct sin_list *sin_iter;
+ struct sin_list *sin_temp;
+
+ LIST_HEAD(sin_list);
+ if (ndev->reg_state >= NETREG_UNREGISTERING)
+ return;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(ndev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ return;
+ }
+
+ for_ifa(in_dev) {
+ struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+ if (!entry) {
+ pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv4 update\n");
+ continue;
+ }
+ entry->ip.sin_family = AF_INET;
+ entry->ip.sin_addr.s_addr = ifa->ifa_address;
+ list_add_tail(&entry->list, &sin_list);
+ }
+ endfor_ifa(in_dev);
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) {
+ update_gid_ip(GID_ADD, ib_dev, port, ndev,
+ (struct sockaddr *)&sin_iter->ip);
+ list_del(&sin_iter->list);
+ kfree(sin_iter);
+ }
+}
+
+static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev)
+{
+ struct inet6_ifaddr *ifp;
+ struct inet6_dev *in6_dev;
+ struct sin6_list {
+ struct list_head list;
+ struct sockaddr_in6 sin6;
+ };
+ struct sin6_list *sin6_iter;
+ struct sin6_list *sin6_temp;
+ struct ib_gid_attr gid_attr = {.ndev = ndev};
+ LIST_HEAD(sin6_list);
+
+ if (ndev->reg_state >= NETREG_UNREGISTERING)
+ return;
+
+ in6_dev = in6_dev_get(ndev);
+ if (!in6_dev)
+ return;
+
+ read_lock_bh(&in6_dev->lock);
+ list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+ struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+ if (!entry) {
+ pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv6 update\n");
+ continue;
+ }
+
+ entry->sin6.sin6_family = AF_INET6;
+ entry->sin6.sin6_addr = ifp->addr;
+ list_add_tail(&entry->list, &sin6_list);
+ }
+ read_unlock_bh(&in6_dev->lock);
+
+ in6_dev_put(in6_dev);
+
+ list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) {
+ union ib_gid gid;
+
+ rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid);
+ update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
+ list_del(&sin6_iter->list);
+ kfree(sin6_iter);
+ }
+}
+
+static void _add_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev)
+{
+ enum_netdev_ipv4_ips(ib_dev, port, ndev);
+ if (IS_ENABLED(CONFIG_IPV6))
+ enum_netdev_ipv6_ips(ib_dev, port, ndev);
+}
+
+static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+
+ enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+ _add_netdev_ips(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net *net;
+ struct net_device *ndev;
+
+ /* Lock the rtnl to make sure the netdevs does not move under
+ * our feet
+ */
+ rtnl_lock();
+ for_each_net(net)
+ for_each_netdev(net, ndev)
+ if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev))
+ add_netdev_ips(ib_dev, port, rdma_ndev, ndev);
+ rtnl_unlock();
+}
+
+/* This function will rescan all of the network devices in the system
+ * and add their gids, as needed, to the relevant RoCE devices. */
+int roce_rescan_device(struct ib_device *ib_dev)
+{
+ ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
+ enum_all_gids_of_dev_cb, NULL);
+
+ return 0;
+}
+
+static void callback_for_addr_gid_device_scan(struct ib_device *device,
+ u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct update_gid_event_work *parsed = cookie;
+
+ return update_gid(parsed->gid_op, device,
+ port, &parsed->gid,
+ &parsed->gid_attr);
+}
+
+static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
+ void *cookie,
+ void (*handle_netdev)(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *ndev))
+{
+ struct net_device *ndev = (struct net_device *)cookie;
+ struct upper_list {
+ struct list_head list;
+ struct net_device *upper;
+ };
+ struct net_device *upper;
+ struct list_head *iter;
+ struct upper_list *upper_iter;
+ struct upper_list *upper_temp;
+ LIST_HEAD(upper_list);
+
+ rcu_read_lock();
+ netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
+ struct upper_list *entry = kmalloc(sizeof(*entry),
+ GFP_ATOMIC);
+
+ if (!entry) {
+ pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n");
+ continue;
+ }
+
+ list_add_tail(&entry->list, &upper_list);
+ dev_hold(upper);
+ entry->upper = upper;
+ }
+ rcu_read_unlock();
+
+ handle_netdev(ib_dev, port, ndev);
+ list_for_each_entry_safe(upper_iter, upper_temp, &upper_list,
+ list) {
+ handle_netdev(ib_dev, port, upper_iter->upper);
+ dev_put(upper_iter->upper);
+ list_del(&upper_iter->list);
+ kfree(upper_iter);
+ }
+}
+
+static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *event_ndev)
+{
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
+}
+
+static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips);
+}
+
+static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net_device *master_ndev;
+
+ rcu_read_lock();
+ master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+ if (master_ndev)
+ dev_hold(master_ndev);
+ rcu_read_unlock();
+
+ if (master_ndev) {
+ bond_delete_netdev_default_gids(ib_dev, port, master_ndev,
+ rdma_ndev);
+ dev_put(master_ndev);
+ }
+}
+
+static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+
+ bond_delete_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+}
+
+/* The following functions operate on all IB devices. netdevice_event and
+ * addr_event execute ib_enum_all_roce_netdevs through a work.
+ * ib_enum_all_roce_netdevs iterates through all IB devices.
+ */
+
+static void netdevice_event_work_handler(struct work_struct *_work)
+{
+ struct netdev_event_work *work =
+ container_of(_work, struct netdev_event_work, work);
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
+ ib_enum_all_roce_netdevs(work->cmds[i].filter,
+ work->cmds[i].filter_ndev,
+ work->cmds[i].cb,
+ work->cmds[i].ndev);
+ dev_put(work->cmds[i].ndev);
+ dev_put(work->cmds[i].filter_ndev);
+ }
+
+ kfree(work);
+}
+
+static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
+ struct net_device *ndev)
+{
+ unsigned int i;
+ struct netdev_event_work *ndev_work =
+ kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+
+ if (!ndev_work) {
+ pr_warn("roce_gid_mgmt: can't allocate work for netdevice_event\n");
+ return NOTIFY_DONE;
+ }
+
+ memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
+ for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
+ if (!ndev_work->cmds[i].ndev)
+ ndev_work->cmds[i].ndev = ndev;
+ if (!ndev_work->cmds[i].filter_ndev)
+ ndev_work->cmds[i].filter_ndev = ndev;
+ dev_hold(ndev_work->cmds[i].ndev);
+ dev_hold(ndev_work->cmds[i].filter_ndev);
+ }
+ INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
+
+ queue_work(ib_wq, &ndev_work->work);
+
+ return NOTIFY_DONE;
+}
+
+static const struct netdev_event_work_cmd add_cmd = {
+ .cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
+static const struct netdev_event_work_cmd add_cmd_upper_ips = {
+ .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev};
+
+static void netdevice_event_changeupper(struct netdev_notifier_changeupper_info *changeupper_info,
+ struct netdev_event_work_cmd *cmds)
+{
+ static const struct netdev_event_work_cmd upper_ips_del_cmd = {
+ .cb = del_netdev_upper_ips, .filter = upper_device_filter};
+ static const struct netdev_event_work_cmd bonding_default_del_cmd = {
+ .cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave};
+
+ if (changeupper_info->linking == false) {
+ cmds[0] = upper_ips_del_cmd;
+ cmds[0].ndev = changeupper_info->upper_dev;
+ cmds[1] = add_cmd;
+ } else {
+ cmds[0] = bonding_default_del_cmd;
+ cmds[0].ndev = changeupper_info->upper_dev;
+ cmds[1] = add_cmd_upper_ips;
+ cmds[1].ndev = changeupper_info->upper_dev;
+ cmds[1].filter_ndev = changeupper_info->upper_dev;
+ }
+}
+
+static int netdevice_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ static const struct netdev_event_work_cmd del_cmd = {
+ .cb = del_netdev_ips, .filter = pass_all_filter};
+ static const struct netdev_event_work_cmd bonding_default_del_cmd_join = {
+ .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave};
+ static const struct netdev_event_work_cmd default_del_cmd = {
+ .cb = del_netdev_default_ips, .filter = pass_all_filter};
+ static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
+ .cb = del_netdev_upper_ips, .filter = upper_device_filter};
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
+
+ if (ndev->type != ARPHRD_ETHER)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ case NETDEV_UP:
+ cmds[0] = bonding_default_del_cmd_join;
+ cmds[1] = add_cmd;
+ break;
+
+ case NETDEV_UNREGISTER:
+ if (ndev->reg_state < NETREG_UNREGISTERED)
+ cmds[0] = del_cmd;
+ else
+ return NOTIFY_DONE;
+ break;
+
+ case NETDEV_CHANGEADDR:
+ cmds[0] = default_del_cmd;
+ cmds[1] = add_cmd;
+ break;
+
+ case NETDEV_CHANGEUPPER:
+ netdevice_event_changeupper(
+ container_of(ptr, struct netdev_notifier_changeupper_info, info),
+ cmds);
+ break;
+
+ case NETDEV_BONDING_FAILOVER:
+ cmds[0] = bonding_event_ips_del_cmd;
+ cmds[1] = bonding_default_del_cmd_join;
+ cmds[2] = add_cmd_upper_ips;
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return netdevice_queue_work(cmds, ndev);
+}
+
+static void update_gid_event_work_handler(struct work_struct *_work)
+{
+ struct update_gid_event_work *work =
+ container_of(_work, struct update_gid_event_work, work);
+
+ ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev,
+ callback_for_addr_gid_device_scan, work);
+
+ dev_put(work->gid_attr.ndev);
+ kfree(work);
+}
+
+static int addr_event(struct notifier_block *this, unsigned long event,
+ struct sockaddr *sa, struct net_device *ndev)
+{
+ struct update_gid_event_work *work;
+ enum gid_op_type gid_op;
+
+ if (ndev->type != ARPHRD_ETHER)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ gid_op = GID_ADD;
+ break;
+
+ case NETDEV_DOWN:
+ gid_op = GID_DEL;
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work) {
+ pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n");
+ return NOTIFY_DONE;
+ }
+
+ INIT_WORK(&work->work, update_gid_event_work_handler);
+
+ rdma_ip2gid(sa, &work->gid);
+ work->gid_op = gid_op;
+
+ memset(&work->gid_attr, 0, sizeof(work->gid_attr));
+ dev_hold(ndev);
+ work->gid_attr.ndev = ndev;
+
+ queue_work(ib_wq, &work->work);
+
+ return NOTIFY_DONE;
+}
+
+static int inetaddr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct sockaddr_in in;
+ struct net_device *ndev;
+ struct in_ifaddr *ifa = ptr;
+
+ in.sin_family = AF_INET;
+ in.sin_addr.s_addr = ifa->ifa_address;
+ ndev = ifa->ifa_dev->dev;
+
+ return addr_event(this, event, (struct sockaddr *)&in, ndev);
+}
+
+static int inet6addr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct sockaddr_in6 in6;
+ struct net_device *ndev;
+ struct inet6_ifaddr *ifa6 = ptr;
+
+ in6.sin6_family = AF_INET6;
+ in6.sin6_addr = ifa6->addr;
+ ndev = ifa6->idev->dev;
+
+ return addr_event(this, event, (struct sockaddr *)&in6, ndev);
+}
+
+static struct notifier_block nb_netdevice = {
+ .notifier_call = netdevice_event
+};
+
+static struct notifier_block nb_inetaddr = {
+ .notifier_call = inetaddr_event
+};
+
+static struct notifier_block nb_inet6addr = {
+ .notifier_call = inet6addr_event
+};
+
+int __init roce_gid_mgmt_init(void)
+{
+ register_inetaddr_notifier(&nb_inetaddr);
+ if (IS_ENABLED(CONFIG_IPV6))
+ register_inet6addr_notifier(&nb_inet6addr);
+ /* We relay on the netdevice notifier to enumerate all
+ * existing devices in the system. Register to this notifier
+ * last to make sure we will not miss any IP add/del
+ * callbacks.
+ */
+ register_netdevice_notifier(&nb_netdevice);
+
+ return 0;
+}
+
+void __exit roce_gid_mgmt_cleanup(void)
+{
+ if (IS_ENABLED(CONFIG_IPV6))
+ unregister_inet6addr_notifier(&nb_inet6addr);
+ unregister_inetaddr_notifier(&nb_inetaddr);
+ unregister_netdevice_notifier(&nb_netdevice);
+ /* Ensure all gid deletion tasks complete before we go down,
+ * to avoid any reference to free'd memory. By the time
+ * ib-core is removed, all physical devices have been removed,
+ * so no issue with remaining hardware contexts.
+ */
+}
diff --git a/kernel/drivers/infiniband/core/sa_query.c b/kernel/drivers/infiniband/core/sa_query.c
index c38f030f0..a95a32ba5 100644
--- a/kernel/drivers/infiniband/core/sa_query.c
+++ b/kernel/drivers/infiniband/core/sa_query.c
@@ -45,12 +45,21 @@
#include <uapi/linux/if_ether.h>
#include <rdma/ib_pack.h>
#include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <uapi/rdma/ib_user_sa.h>
+#include <rdma/ib_marshall.h>
#include "sa.h"
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("InfiniBand subnet administration query support");
MODULE_LICENSE("Dual BSD/GPL");
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000
+#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
struct ib_sa_sm_ah {
struct ib_ah *ah;
struct kref ref;
@@ -80,8 +89,16 @@ struct ib_sa_query {
struct ib_mad_send_buf *mad_buf;
struct ib_sa_sm_ah *sm_ah;
int id;
+ u32 flags;
+ struct list_head list; /* Local svc request list */
+ u32 seq; /* Local svc request sequence number */
+ unsigned long timeout; /* Local svc timeout */
+ u8 path_use; /* How will the pathrecord be used */
};
+#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001
+#define IB_SA_CANCEL 0x00000002
+
struct ib_sa_service_query {
void (*callback)(int, struct ib_sa_service_rec *, void *);
void *context;
@@ -106,8 +123,28 @@ struct ib_sa_mcmember_query {
struct ib_sa_query sa_query;
};
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = {
+ [LS_NLA_TYPE_PATH_RECORD] = {.type = NLA_BINARY,
+ .len = sizeof(struct ib_path_rec_data)},
+ [LS_NLA_TYPE_TIMEOUT] = {.type = NLA_U32},
+ [LS_NLA_TYPE_SERVICE_ID] = {.type = NLA_U64},
+ [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+ [LS_NLA_TYPE_SGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+ [LS_NLA_TYPE_TCLASS] = {.type = NLA_U8},
+ [LS_NLA_TYPE_PKEY] = {.type = NLA_U16},
+ [LS_NLA_TYPE_QOS_CLASS] = {.type = NLA_U16},
+};
+
+
static void ib_sa_add_one(struct ib_device *device);
-static void ib_sa_remove_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device, void *client_data);
static struct ib_client sa_client = {
.name = "sa",
@@ -381,6 +418,429 @@ static const struct ib_field guidinfo_rec_table[] = {
.size_bits = 512 },
};
+static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
+{
+ query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
+}
+
+static inline int ib_sa_query_cancelled(struct ib_sa_query *query)
+{
+ return (query->flags & IB_SA_CANCEL);
+}
+
+static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
+ struct ib_sa_query *query)
+{
+ struct ib_sa_path_rec *sa_rec = query->mad_buf->context[1];
+ struct ib_sa_mad *mad = query->mad_buf->mad;
+ ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask;
+ u16 val16;
+ u64 val64;
+ struct rdma_ls_resolve_header *header;
+
+ query->mad_buf->context[1] = NULL;
+
+ /* Construct the family header first */
+ header = (struct rdma_ls_resolve_header *)
+ skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+ memcpy(header->device_name, query->port->agent->device->name,
+ LS_DEVICE_NAME_MAX);
+ header->port_num = query->port->port_num;
+
+ if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
+ sa_rec->reversible != 0)
+ query->path_use = LS_RESOLVE_PATH_USE_GMP;
+ else
+ query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
+ header->path_use = query->path_use;
+
+ /* Now build the attributes */
+ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) {
+ val64 = be64_to_cpu(sa_rec->service_id);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID,
+ sizeof(val64), &val64);
+ }
+ if (comp_mask & IB_SA_PATH_REC_DGID)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID,
+ sizeof(sa_rec->dgid), &sa_rec->dgid);
+ if (comp_mask & IB_SA_PATH_REC_SGID)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID,
+ sizeof(sa_rec->sgid), &sa_rec->sgid);
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS,
+ sizeof(sa_rec->traffic_class), &sa_rec->traffic_class);
+
+ if (comp_mask & IB_SA_PATH_REC_PKEY) {
+ val16 = be16_to_cpu(sa_rec->pkey);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY,
+ sizeof(val16), &val16);
+ }
+ if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) {
+ val16 = be16_to_cpu(sa_rec->qos_class);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS,
+ sizeof(val16), &val16);
+ }
+}
+
+static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask)
+{
+ int len = 0;
+
+ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID)
+ len += nla_total_size(sizeof(u64));
+ if (comp_mask & IB_SA_PATH_REC_DGID)
+ len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+ if (comp_mask & IB_SA_PATH_REC_SGID)
+ len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+ len += nla_total_size(sizeof(u8));
+ if (comp_mask & IB_SA_PATH_REC_PKEY)
+ len += nla_total_size(sizeof(u16));
+ if (comp_mask & IB_SA_PATH_REC_QOS_CLASS)
+ len += nla_total_size(sizeof(u16));
+
+ /*
+ * Make sure that at least some of the required comp_mask bits are
+ * set.
+ */
+ if (WARN_ON(len == 0))
+ return len;
+
+ /* Add the family header */
+ len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header));
+
+ return len;
+}
+
+static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ void *data;
+ int ret = 0;
+ struct ib_sa_mad *mad;
+ int len;
+
+ mad = query->mad_buf->mad;
+ len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask);
+ if (len <= 0)
+ return -EMSGSIZE;
+
+ skb = nlmsg_new(len, gfp_mask);
+ if (!skb)
+ return -ENOMEM;
+
+ /* Put nlmsg header only for now */
+ data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
+ RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
+ if (!data) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ /* Add attributes */
+ ib_nl_set_path_rec_attrs(skb, query);
+
+ /* Repair the nlmsg header length */
+ nlmsg_end(skb, nlh);
+
+ ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, gfp_mask);
+ if (!ret)
+ ret = len;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+ unsigned long flags;
+ unsigned long delay;
+ int ret;
+
+ INIT_LIST_HEAD(&query->list);
+ query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+
+ /* Put the request on the list first.*/
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+ query->timeout = delay + jiffies;
+ list_add_tail(&query->list, &ib_nl_request_list);
+ /* Start the timeout if this is the only request */
+ if (ib_nl_request_list.next == &query->list)
+ queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+ ret = ib_nl_send_msg(query, gfp_mask);
+ if (ret <= 0) {
+ ret = -EIO;
+ /* Remove the request */
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_del(&query->list);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ } else {
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_sa_query *wait_query;
+ int found = 0;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_for_each_entry(wait_query, &ib_nl_request_list, list) {
+ /* Let the timeout to take care of the callback */
+ if (query == wait_query) {
+ query->flags |= IB_SA_CANCEL;
+ query->timeout = jiffies;
+ list_move(&query->list, &ib_nl_request_list);
+ found = 1;
+ mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+ return found;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
+ const struct nlmsghdr *nlh)
+{
+ struct ib_mad_send_wc mad_send_wc;
+ struct ib_sa_mad *mad = NULL;
+ const struct nlattr *head, *curr;
+ struct ib_path_rec_data *rec;
+ int len, rem;
+ u32 mask = 0;
+ int status = -EIO;
+
+ if (query->callback) {
+ head = (const struct nlattr *) nlmsg_data(nlh);
+ len = nlmsg_len(nlh);
+ switch (query->path_use) {
+ case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
+ mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
+ break;
+
+ case LS_RESOLVE_PATH_USE_ALL:
+ case LS_RESOLVE_PATH_USE_GMP:
+ default:
+ mask = IB_PATH_PRIMARY | IB_PATH_GMP |
+ IB_PATH_BIDIRECTIONAL;
+ break;
+ }
+ nla_for_each_attr(curr, head, len, rem) {
+ if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
+ rec = nla_data(curr);
+ /*
+ * Get the first one. In the future, we may
+ * need to get up to 6 pathrecords.
+ */
+ if ((rec->flags & mask) == mask) {
+ mad = query->mad_buf->mad;
+ mad->mad_hdr.method |=
+ IB_MGMT_METHOD_RESP;
+ memcpy(mad->data, rec->path_rec,
+ sizeof(rec->path_rec));
+ status = 0;
+ break;
+ }
+ }
+ }
+ query->callback(query, status, mad);
+ }
+
+ mad_send_wc.send_buf = query->mad_buf;
+ mad_send_wc.status = IB_WC_SUCCESS;
+ send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+ unsigned long flags;
+ struct ib_sa_query *query;
+ unsigned long delay;
+ struct ib_mad_send_wc mad_send_wc;
+ int ret;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ while (!list_empty(&ib_nl_request_list)) {
+ query = list_entry(ib_nl_request_list.next,
+ struct ib_sa_query, list);
+
+ if (time_after(query->timeout, jiffies)) {
+ delay = query->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+ break;
+ }
+
+ list_del(&query->list);
+ ib_sa_disable_local_svc(query);
+ /* Hold the lock to protect against query cancellation */
+ if (ib_sa_query_cancelled(query))
+ ret = -1;
+ else
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ if (ret) {
+ mad_send_wc.send_buf = query->mad_buf;
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ send_handler(query->port->agent, &mad_send_wc);
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
+
+static int ib_nl_handle_set_timeout(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+ int timeout, delta, abs_delta;
+ const struct nlattr *attr;
+ unsigned long flags;
+ struct ib_sa_query *query;
+ long delay = 0;
+ struct nlattr *tb[LS_NLA_TYPE_MAX];
+ int ret;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_policy);
+ attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];
+ if (ret || !attr)
+ goto settimeout_out;
+
+ timeout = *(int *) nla_data(attr);
+ if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
+ timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
+ if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
+ timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+
+ delta = timeout - sa_local_svc_timeout_ms;
+ if (delta < 0)
+ abs_delta = -delta;
+ else
+ abs_delta = delta;
+
+ if (delta != 0) {
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ sa_local_svc_timeout_ms = timeout;
+ list_for_each_entry(query, &ib_nl_request_list, list) {
+ if (delta < 0 && abs_delta > query->timeout)
+ query->timeout = 0;
+ else
+ query->timeout += delta;
+
+ /* Get the new delay from the first entry */
+ if (!delay) {
+ delay = query->timeout - jiffies;
+ if (delay <= 0)
+ delay = 1;
+ }
+ }
+ if (delay)
+ mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
+ (unsigned long)delay);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ }
+
+settimeout_out:
+ return skb->len;
+}
+
+static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
+{
+ struct nlattr *tb[LS_NLA_TYPE_MAX];
+ int ret;
+
+ if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+ return 0;
+
+ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_policy);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+
+static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+ unsigned long flags;
+ struct ib_sa_query *query;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_send_wc mad_send_wc;
+ int found = 0;
+ int ret;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_for_each_entry(query, &ib_nl_request_list, list) {
+ /*
+ * If the query is cancelled, let the timeout routine
+ * take care of it.
+ */
+ if (nlh->nlmsg_seq == query->seq) {
+ found = !ib_sa_query_cancelled(query);
+ if (found)
+ list_del(&query->list);
+ break;
+ }
+ }
+
+ if (!found) {
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ goto resp_out;
+ }
+
+ send_buf = query->mad_buf;
+
+ if (!ib_nl_is_good_resolve_resp(nlh)) {
+ /* if the result is a failure, send out the packet via IB */
+ ib_sa_disable_local_svc(query);
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ if (ret) {
+ mad_send_wc.send_buf = send_buf;
+ mad_send_wc.status = IB_WC_GENERAL_ERR;
+ send_handler(query->port->agent, &mad_send_wc);
+ }
+ } else {
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ ib_nl_process_good_resolve_rsp(query, nlh);
+ }
+
+resp_out:
+ return skb->len;
+}
+
+static struct ibnl_client_cbs ib_sa_cb_table[] = {
+ [RDMA_NL_LS_OP_RESOLVE] = {
+ .dump = ib_nl_handle_resolve_resp,
+ .module = THIS_MODULE },
+ [RDMA_NL_LS_OP_SET_TIMEOUT] = {
+ .dump = ib_nl_handle_set_timeout,
+ .module = THIS_MODULE },
+};
+
static void free_sm_ah(struct kref *kref)
{
struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -450,7 +910,7 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event
struct ib_sa_port *port =
&sa_dev->port[event->element.port_num - sa_dev->start_port];
- if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND)
+ if (!rdma_cap_ib_sa(handler->device, port->port_num))
return;
spin_lock_irqsave(&port->ah_lock, flags);
@@ -502,7 +962,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
mad_buf = query->mad_buf;
spin_unlock_irqrestore(&idr_lock, flags);
- ib_cancel_mad(agent, mad_buf);
+ /*
+ * If the query is still on the netlink request list, schedule
+ * it to be cancelled by the timeout routine. Otherwise, it has been
+ * sent to the MAD layer and has to be cancelled from there.
+ */
+ if (!ib_nl_cancel_request(query))
+ ib_cancel_mad(agent, mad_buf);
}
EXPORT_SYMBOL(ib_sa_cancel_query);
@@ -540,29 +1006,32 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
ah_attr->port_num = port_num;
ah_attr->static_rate = rec->rate;
- force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET;
+ force_grh = rdma_cap_eth_ah(device, port_num);
if (rec->hop_limit > 1 || force_grh) {
+ struct net_device *ndev = ib_get_ndev_from_path(rec);
+
ah_attr->ah_flags = IB_AH_GRH;
ah_attr->grh.dgid = rec->dgid;
- ret = ib_find_cached_gid(device, &rec->sgid, &port_num,
+ ret = ib_find_cached_gid(device, &rec->sgid, ndev, &port_num,
&gid_index);
- if (ret)
+ if (ret) {
+ if (ndev)
+ dev_put(ndev);
return ret;
+ }
ah_attr->grh.sgid_index = gid_index;
ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
ah_attr->grh.hop_limit = rec->hop_limit;
ah_attr->grh.traffic_class = rec->traffic_class;
+ if (ndev)
+ dev_put(ndev);
}
if (force_grh) {
memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
- ah_attr->vlan_id = rec->vlan_id;
- } else {
- ah_attr->vlan_id = 0xffff;
}
-
return 0;
}
EXPORT_SYMBOL(ib_init_ah_from_path);
@@ -583,7 +1052,8 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
query->mad_buf = ib_create_send_mad(query->port->agent, 1,
query->sm_ah->pkey_index,
0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
- gfp_mask);
+ gfp_mask,
+ IB_MGMT_BASE_VERSION);
if (IS_ERR(query->mad_buf)) {
kref_put(&query->sm_ah->ref, free_sm_ah);
return -ENOMEM;
@@ -618,7 +1088,7 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
{
- bool preload = !!(gfp_mask & __GFP_WAIT);
+ bool preload = gfpflags_allow_blocking(gfp_mask);
unsigned long flags;
int ret, id;
@@ -638,6 +1108,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
query->mad_buf->context[0] = query;
query->id = id;
+ if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) {
+ if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
+ if (!ib_nl_make_request(query, gfp_mask))
+ return id;
+ }
+ ib_sa_disable_local_svc(query);
+ }
+
ret = ib_post_send_mad(query->mad_buf, NULL);
if (ret) {
spin_lock_irqsave(&idr_lock, flags);
@@ -677,9 +1155,9 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
mad->data, &rec);
- rec.vlan_id = 0xffff;
+ rec.net = NULL;
+ rec.ifindex = 0;
memset(rec.dmac, 0, ETH_ALEN);
- memset(rec.smac, 0, ETH_ALEN);
query->callback(status, &rec, query->context);
} else
query->callback(status, NULL, query->context);
@@ -739,7 +1217,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -766,6 +1244,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
*sa_query = &query->sa_query;
+ query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE;
+ query->sa_query.mad_buf->context[1] = rec;
+
ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
if (ret < 0)
goto err2;
@@ -861,7 +1342,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
method != IB_SA_METHOD_DELETE)
return -EINVAL;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -953,7 +1434,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -1050,7 +1531,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -1153,16 +1634,10 @@ static void ib_sa_add_one(struct ib_device *device)
{
struct ib_sa_device *sa_dev;
int s, e, i;
+ int count = 0;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
-
- if (device->node_type == RDMA_NODE_IB_SWITCH)
- s = e = 0;
- else {
- s = 1;
- e = device->phys_port_cnt;
- }
+ s = rdma_start_port(device);
+ e = rdma_end_port(device);
sa_dev = kzalloc(sizeof *sa_dev +
(e - s + 1) * sizeof (struct ib_sa_port),
@@ -1175,7 +1650,7 @@ static void ib_sa_add_one(struct ib_device *device)
for (i = 0; i <= e - s; ++i) {
spin_lock_init(&sa_dev->port[i].ah_lock);
- if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND)
+ if (!rdma_cap_ib_sa(device, i + 1))
continue;
sa_dev->port[i].sm_ah = NULL;
@@ -1189,8 +1664,13 @@ static void ib_sa_add_one(struct ib_device *device)
goto err;
INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
+
+ count++;
}
+ if (!count)
+ goto free;
+
ib_set_client_data(device, &sa_client, sa_dev);
/*
@@ -1204,25 +1684,26 @@ static void ib_sa_add_one(struct ib_device *device)
if (ib_register_event_handler(&sa_dev->event_handler))
goto err;
- for (i = 0; i <= e - s; ++i)
- if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+ for (i = 0; i <= e - s; ++i) {
+ if (rdma_cap_ib_sa(device, i + 1))
update_sm_ah(&sa_dev->port[i].update_task);
+ }
return;
err:
- while (--i >= 0)
- if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+ while (--i >= 0) {
+ if (rdma_cap_ib_sa(device, i + 1))
ib_unregister_mad_agent(sa_dev->port[i].agent);
-
+ }
+free:
kfree(sa_dev);
-
return;
}
-static void ib_sa_remove_one(struct ib_device *device)
+static void ib_sa_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_device *sa_dev = client_data;
int i;
if (!sa_dev)
@@ -1233,7 +1714,7 @@ static void ib_sa_remove_one(struct ib_device *device)
flush_workqueue(ib_wq);
for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
- if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
+ if (rdma_cap_ib_sa(device, i + 1)) {
ib_unregister_mad_agent(sa_dev->port[i].agent);
if (sa_dev->port[i].sm_ah)
kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
@@ -1250,6 +1731,8 @@ static int __init ib_sa_init(void)
get_random_bytes(&tid, sizeof tid);
+ atomic_set(&ib_nl_sa_request_seq, 0);
+
ret = ib_register_client(&sa_client);
if (ret) {
printk(KERN_ERR "Couldn't register ib_sa client\n");
@@ -1262,7 +1745,25 @@ static int __init ib_sa_init(void)
goto err2;
}
+ ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
+ if (!ib_nl_wq) {
+ ret = -ENOMEM;
+ goto err3;
+ }
+
+ if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS,
+ ib_sa_cb_table)) {
+ pr_err("Failed to add netlink callback\n");
+ ret = -EINVAL;
+ goto err4;
+ }
+ INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
return 0;
+err4:
+ destroy_workqueue(ib_nl_wq);
+err3:
+ mcast_cleanup();
err2:
ib_unregister_client(&sa_client);
err1:
@@ -1271,6 +1772,10 @@ err1:
static void __exit ib_sa_cleanup(void)
{
+ ibnl_remove_client(RDMA_NL_LS);
+ cancel_delayed_work(&ib_nl_timed_work);
+ flush_workqueue(ib_nl_wq);
+ destroy_workqueue(ib_nl_wq);
mcast_cleanup();
ib_unregister_client(&sa_client);
idr_destroy(&query_idr);
diff --git a/kernel/drivers/infiniband/core/smi.c b/kernel/drivers/infiniband/core/smi.c
index 5855e4405..f19b23817 100644
--- a/kernel/drivers/infiniband/core/smi.c
+++ b/kernel/drivers/infiniband/core/smi.c
@@ -5,6 +5,7 @@
* Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
* Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -38,85 +39,82 @@
#include <rdma/ib_smi.h>
#include "smi.h"
-
-/*
- * Fixup a directed route SMP for sending
- * Return 0 if the SMP should be discarded
- */
-enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
- u8 node_type, int port_num)
+#include "opa_smi.h"
+
+static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num,
+ u8 *hop_ptr, u8 hop_cnt,
+ const u8 *initial_path,
+ const u8 *return_path,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
{
- u8 hop_ptr, hop_cnt;
-
- hop_ptr = smp->hop_ptr;
- hop_cnt = smp->hop_cnt;
-
/* See section 14.2.2.2, Vol 1 IB spec */
/* C14-6 -- valid hop_cnt values are from 0 to 63 */
if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
return IB_SMI_DISCARD;
- if (!ib_get_smp_direction(smp)) {
+ if (!direction) {
/* C14-9:1 */
- if (hop_cnt && hop_ptr == 0) {
- smp->hop_ptr++;
- return (smp->initial_path[smp->hop_ptr] ==
+ if (hop_cnt && *hop_ptr == 0) {
+ (*hop_ptr)++;
+ return (initial_path[*hop_ptr] ==
port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-9:2 */
- if (hop_ptr && hop_ptr < hop_cnt) {
- if (node_type != RDMA_NODE_IB_SWITCH)
+ if (*hop_ptr && *hop_ptr < hop_cnt) {
+ if (!is_switch)
return IB_SMI_DISCARD;
- /* smp->return_path set when received */
- smp->hop_ptr++;
- return (smp->initial_path[smp->hop_ptr] ==
+ /* return_path set when received */
+ (*hop_ptr)++;
+ return (initial_path[*hop_ptr] ==
port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-9:3 -- We're at the end of the DR segment of path */
- if (hop_ptr == hop_cnt) {
- /* smp->return_path set when received */
- smp->hop_ptr++;
- return (node_type == RDMA_NODE_IB_SWITCH ||
- smp->dr_dlid == IB_LID_PERMISSIVE ?
+ if (*hop_ptr == hop_cnt) {
+ /* return_path set when received */
+ (*hop_ptr)++;
+ return (is_switch ||
+ dr_dlid_is_permissive ?
IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
/* C14-9:5 -- Fail unreasonable hop pointer */
- return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
} else {
/* C14-13:1 */
- if (hop_cnt && hop_ptr == hop_cnt + 1) {
- smp->hop_ptr--;
- return (smp->return_path[smp->hop_ptr] ==
+ if (hop_cnt && *hop_ptr == hop_cnt + 1) {
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-13:2 */
- if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
- if (node_type != RDMA_NODE_IB_SWITCH)
+ if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) {
+ if (!is_switch)
return IB_SMI_DISCARD;
- smp->hop_ptr--;
- return (smp->return_path[smp->hop_ptr] ==
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-13:3 -- at the end of the DR segment of path */
- if (hop_ptr == 1) {
- smp->hop_ptr--;
+ if (*hop_ptr == 1) {
+ (*hop_ptr)--;
/* C14-13:3 -- SMPs destined for SM shouldn't be here */
- return (node_type == RDMA_NODE_IB_SWITCH ||
- smp->dr_slid == IB_LID_PERMISSIVE ?
+ return (is_switch ||
+ dr_slid_is_permissive ?
IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
- if (hop_ptr == 0)
+ if (*hop_ptr == 0)
return IB_SMI_HANDLE;
/* C14-13:5 -- Check for unreasonable hop pointer */
@@ -125,105 +123,163 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
}
/*
- * Adjust information for a received SMP
- * Return 0 if the SMP should be dropped
+ * Fixup a directed route SMP for sending
+ * Return IB_SMI_DISCARD if the SMP should be discarded
*/
-enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
- int port_num, int phys_port_cnt)
+enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ bool is_switch, int port_num)
{
- u8 hop_ptr, hop_cnt;
+ return __smi_handle_dr_smp_send(is_switch, port_num,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->initial_path,
+ smp->return_path,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
- hop_ptr = smp->hop_ptr;
- hop_cnt = smp->hop_cnt;
+enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
+ bool is_switch, int port_num)
+{
+ return __smi_handle_dr_smp_send(is_switch, port_num,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->route.dr.initial_path,
+ smp->route.dr.return_path,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
+static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num,
+ int phys_port_cnt,
+ u8 *hop_ptr, u8 hop_cnt,
+ const u8 *initial_path,
+ u8 *return_path,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
/* See section 14.2.2.2, Vol 1 IB spec */
/* C14-6 -- valid hop_cnt values are from 0 to 63 */
if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
return IB_SMI_DISCARD;
- if (!ib_get_smp_direction(smp)) {
+ if (!direction) {
/* C14-9:1 -- sender should have incremented hop_ptr */
- if (hop_cnt && hop_ptr == 0)
+ if (hop_cnt && *hop_ptr == 0)
return IB_SMI_DISCARD;
/* C14-9:2 -- intermediate hop */
- if (hop_ptr && hop_ptr < hop_cnt) {
- if (node_type != RDMA_NODE_IB_SWITCH)
+ if (*hop_ptr && *hop_ptr < hop_cnt) {
+ if (!is_switch)
return IB_SMI_DISCARD;
- smp->return_path[hop_ptr] = port_num;
- /* smp->hop_ptr updated when sending */
- return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ?
+ return_path[*hop_ptr] = port_num;
+ /* hop_ptr updated when sending */
+ return (initial_path[*hop_ptr+1] <= phys_port_cnt ?
IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-9:3 -- We're at the end of the DR segment of path */
- if (hop_ptr == hop_cnt) {
+ if (*hop_ptr == hop_cnt) {
if (hop_cnt)
- smp->return_path[hop_ptr] = port_num;
- /* smp->hop_ptr updated when sending */
+ return_path[*hop_ptr] = port_num;
+ /* hop_ptr updated when sending */
- return (node_type == RDMA_NODE_IB_SWITCH ||
- smp->dr_dlid == IB_LID_PERMISSIVE ?
+ return (is_switch ||
+ dr_dlid_is_permissive ?
IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
/* C14-9:5 -- fail unreasonable hop pointer */
- return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
} else {
/* C14-13:1 */
- if (hop_cnt && hop_ptr == hop_cnt + 1) {
- smp->hop_ptr--;
- return (smp->return_path[smp->hop_ptr] ==
+ if (hop_cnt && *hop_ptr == hop_cnt + 1) {
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-13:2 */
- if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
- if (node_type != RDMA_NODE_IB_SWITCH)
+ if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) {
+ if (!is_switch)
return IB_SMI_DISCARD;
- /* smp->hop_ptr updated when sending */
- return (smp->return_path[hop_ptr-1] <= phys_port_cnt ?
+ /* hop_ptr updated when sending */
+ return (return_path[*hop_ptr-1] <= phys_port_cnt ?
IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-13:3 -- We're at the end of the DR segment of path */
- if (hop_ptr == 1) {
- if (smp->dr_slid == IB_LID_PERMISSIVE) {
+ if (*hop_ptr == 1) {
+ if (dr_slid_is_permissive) {
/* giving SMP to SM - update hop_ptr */
- smp->hop_ptr--;
+ (*hop_ptr)--;
return IB_SMI_HANDLE;
}
- /* smp->hop_ptr updated when sending */
- return (node_type == RDMA_NODE_IB_SWITCH ?
- IB_SMI_HANDLE : IB_SMI_DISCARD);
+ /* hop_ptr updated when sending */
+ return (is_switch ? IB_SMI_HANDLE : IB_SMI_DISCARD);
}
/* C14-13:4 -- hop_ptr = 0 -> give to SM */
/* C14-13:5 -- Check for unreasonable hop pointer */
- return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ return (*hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
}
}
-enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+/*
+ * Adjust information for a received SMP
+ * Return IB_SMI_DISCARD if the SMP should be dropped
+ */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
+ int port_num, int phys_port_cnt)
{
- u8 hop_ptr, hop_cnt;
+ return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->initial_path,
+ smp->return_path,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
- hop_ptr = smp->hop_ptr;
- hop_cnt = smp->hop_cnt;
+/*
+ * Adjust information for a received SMP
+ * Return IB_SMI_DISCARD if the SMP should be dropped
+ */
+enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,
+ int port_num, int phys_port_cnt)
+{
+ return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->route.dr.initial_path,
+ smp->route.dr.return_path,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
- if (!ib_get_smp_direction(smp)) {
+static enum smi_forward_action __smi_check_forward_dr_smp(u8 hop_ptr, u8 hop_cnt,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
+ if (!direction) {
/* C14-9:2 -- intermediate hop */
if (hop_ptr && hop_ptr < hop_cnt)
return IB_SMI_FORWARD;
/* C14-9:3 -- at the end of the DR segment of path */
if (hop_ptr == hop_cnt)
- return (smp->dr_dlid == IB_LID_PERMISSIVE ?
+ return (dr_dlid_is_permissive ?
IB_SMI_SEND : IB_SMI_LOCAL);
/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
@@ -236,10 +292,29 @@ enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
/* C14-13:3 -- at the end of the DR segment of path */
if (hop_ptr == 1)
- return (smp->dr_slid != IB_LID_PERMISSIVE ?
+ return (!dr_slid_is_permissive ?
IB_SMI_SEND : IB_SMI_LOCAL);
}
return IB_SMI_LOCAL;
+
+}
+
+enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+ return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
+
+enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp)
+{
+ return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
}
/*
@@ -251,3 +326,13 @@ int smi_get_fwd_port(struct ib_smp *smp)
return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] :
smp->return_path[smp->hop_ptr-1]);
}
+
+/*
+ * Return the forwarding port number from initial_path for outgoing SMP and
+ * from return_path for returning SMP
+ */
+int opa_smi_get_fwd_port(struct opa_smp *smp)
+{
+ return !opa_get_smp_direction(smp) ? smp->route.dr.initial_path[smp->hop_ptr+1] :
+ smp->route.dr.return_path[smp->hop_ptr-1];
+}
diff --git a/kernel/drivers/infiniband/core/smi.h b/kernel/drivers/infiniband/core/smi.h
index aff96bac4..33c91c8a1 100644
--- a/kernel/drivers/infiniband/core/smi.h
+++ b/kernel/drivers/infiniband/core/smi.h
@@ -51,12 +51,12 @@ enum smi_forward_action {
IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */
};
-enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
int port_num, int phys_port_cnt);
int smi_get_fwd_port(struct ib_smp *smp);
extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
- u8 node_type, int port_num);
+ bool is_switch, int port_num);
/*
* Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
diff --git a/kernel/drivers/infiniband/core/sysfs.c b/kernel/drivers/infiniband/core/sysfs.c
index cbd0383f6..b1f37d409 100644
--- a/kernel/drivers/infiniband/core/sysfs.c
+++ b/kernel/drivers/infiniband/core/sysfs.c
@@ -289,7 +289,7 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
union ib_gid gid;
ssize_t ret;
- ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
+ ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, NULL);
if (ret)
return ret;
@@ -326,6 +326,8 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
int width = (tab_attr->index >> 16) & 0xff;
struct ib_mad *in_mad = NULL;
struct ib_mad *out_mad = NULL;
+ size_t mad_size = sizeof(*out_mad);
+ u16 out_mad_pkey_index = 0;
ssize_t ret;
if (!p->ibdev->process_mad)
@@ -347,7 +349,10 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
in_mad->data[41] = p->port_num; /* PortSelect field */
if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
- p->port_num, NULL, NULL, in_mad, out_mad) &
+ p->port_num, NULL, NULL,
+ (const struct ib_mad_hdr *)in_mad, mad_size,
+ (struct ib_mad_hdr *)out_mad, &mad_size,
+ &out_mad_pkey_index) &
(IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
(IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
ret = -EINVAL;
@@ -452,28 +457,6 @@ static struct kobj_type port_type = {
.default_attrs = port_default_attrs
};
-static void ib_device_release(struct device *device)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- kfree(dev);
-}
-
-static int ib_device_uevent(struct device *device,
- struct kobj_uevent_env *env)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- if (add_uevent_var(env, "NAME=%s", dev->name))
- return -ENOMEM;
-
- /*
- * It would be nice to pass the node GUID with the event...
- */
-
- return 0;
-}
-
static struct attribute **
alloc_group_attrs(ssize_t (*show)(struct ib_port *,
struct port_attribute *, char *buf),
@@ -696,12 +679,6 @@ static struct device_attribute *ib_class_attributes[] = {
&dev_attr_node_desc
};
-static struct class ib_class = {
- .name = "infiniband",
- .dev_release = ib_device_release,
- .dev_uevent = ib_device_uevent,
-};
-
/* Show a given an attribute in the statistics group */
static ssize_t show_protocol_stat(const struct device *device,
struct device_attribute *attr, char *buf,
@@ -840,14 +817,12 @@ int ib_device_register_sysfs(struct ib_device *device,
int ret;
int i;
- class_dev->class = &ib_class;
- class_dev->parent = device->dma_device;
- dev_set_name(class_dev, "%s", device->name);
- dev_set_drvdata(class_dev, device);
-
- INIT_LIST_HEAD(&device->port_list);
+ device->dev.parent = device->dma_device;
+ ret = dev_set_name(class_dev, "%s", device->name);
+ if (ret)
+ return ret;
- ret = device_register(class_dev);
+ ret = device_add(class_dev);
if (ret)
goto err;
@@ -864,7 +839,7 @@ int ib_device_register_sysfs(struct ib_device *device,
goto err_put;
}
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ if (rdma_cap_ib_switch(device)) {
ret = add_port(device, 0, port_callback);
if (ret)
goto err_put;
@@ -910,13 +885,3 @@ void ib_device_unregister_sysfs(struct ib_device *device)
device_unregister(&device->dev);
}
-
-int ib_sysfs_setup(void)
-{
- return class_register(&ib_class);
-}
-
-void ib_sysfs_cleanup(void)
-{
- class_unregister(&ib_class);
-}
diff --git a/kernel/drivers/infiniband/core/ucm.c b/kernel/drivers/infiniband/core/ucm.c
index f2f63933e..6b4e8a008 100644
--- a/kernel/drivers/infiniband/core/ucm.c
+++ b/kernel/drivers/infiniband/core/ucm.c
@@ -109,7 +109,7 @@ enum {
#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
static struct ib_client ucm_client = {
.name = "ucm",
@@ -658,8 +658,7 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
if (result)
goto out;
- result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
- NULL);
+ result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
out:
ib_ucm_ctx_put(ctx);
return result;
@@ -1193,6 +1192,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp)
return 0;
}
+static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
static void ib_ucm_release_dev(struct device *dev)
{
struct ib_ucm_device *ucm_dev;
@@ -1202,7 +1202,7 @@ static void ib_ucm_release_dev(struct device *dev)
if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
clear_bit(ucm_dev->devnum, dev_map);
else
- clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map);
+ clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map);
kfree(ucm_dev);
}
@@ -1226,7 +1226,6 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
static dev_t overflow_maj;
-static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
static int find_overflow_devnum(void)
{
int ret;
@@ -1253,8 +1252,7 @@ static void ib_ucm_add_one(struct ib_device *device)
dev_t base;
struct ib_ucm_device *ucm_dev;
- if (!device->alloc_ucontext ||
- rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ if (!device->alloc_ucontext || !rdma_cap_ib_cm(device, 1))
return;
ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
@@ -1311,9 +1309,9 @@ err:
return;
}
-static void ib_ucm_remove_one(struct ib_device *device)
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+ struct ib_ucm_device *ucm_dev = client_data;
if (!ucm_dev)
return;
diff --git a/kernel/drivers/infiniband/core/ucma.c b/kernel/drivers/infiniband/core/ucma.c
index 45d67e922..8b5a934e1 100644
--- a/kernel/drivers/infiniband/core/ucma.c
+++ b/kernel/drivers/infiniband/core/ucma.c
@@ -42,6 +42,7 @@
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/module.h>
+#include <linux/nsproxy.h>
#include <rdma/rdma_user_cm.h>
#include <rdma/ib_marshall.h>
@@ -74,6 +75,7 @@ struct ucma_file {
struct list_head ctx_list;
struct list_head event_list;
wait_queue_head_t poll_wait;
+ struct workqueue_struct *close_wq;
};
struct ucma_context {
@@ -89,6 +91,13 @@ struct ucma_context {
struct list_head list;
struct list_head mc_list;
+ /* mark that device is in process of destroying the internal HW
+ * resources, protected by the global mut
+ */
+ int closing;
+ /* sync between removal event and id destroy, protected by file mut */
+ int destroying;
+ struct work_struct close_work;
};
struct ucma_multicast {
@@ -107,6 +116,7 @@ struct ucma_event {
struct list_head list;
struct rdma_cm_id *cm_id;
struct rdma_ucm_event_resp resp;
+ struct work_struct close_work;
};
static DEFINE_MUTEX(mut);
@@ -132,8 +142,12 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
mutex_lock(&mut);
ctx = _ucma_find_context(id, file);
- if (!IS_ERR(ctx))
- atomic_inc(&ctx->ref);
+ if (!IS_ERR(ctx)) {
+ if (ctx->closing)
+ ctx = ERR_PTR(-EIO);
+ else
+ atomic_inc(&ctx->ref);
+ }
mutex_unlock(&mut);
return ctx;
}
@@ -144,6 +158,28 @@ static void ucma_put_ctx(struct ucma_context *ctx)
complete(&ctx->comp);
}
+static void ucma_close_event_id(struct work_struct *work)
+{
+ struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work);
+
+ rdma_destroy_id(uevent_close->cm_id);
+ kfree(uevent_close);
+}
+
+static void ucma_close_id(struct work_struct *work)
+{
+ struct ucma_context *ctx = container_of(work, struct ucma_context, close_work);
+
+ /* once all inflight tasks are finished, we close all underlying
+ * resources. The context is still alive till its explicit destryoing
+ * by its creator.
+ */
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ /* No new events will be generated after destroying the id. */
+ rdma_destroy_id(ctx->cm_id);
+}
+
static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
{
struct ucma_context *ctx;
@@ -152,6 +188,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
if (!ctx)
return NULL;
+ INIT_WORK(&ctx->close_work, ucma_close_id);
atomic_set(&ctx->ref, 1);
init_completion(&ctx->comp);
INIT_LIST_HEAD(&ctx->mc_list);
@@ -242,6 +279,44 @@ static void ucma_set_event_context(struct ucma_context *ctx,
}
}
+/* Called with file->mut locked for the relevant context. */
+static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
+{
+ struct ucma_context *ctx = cm_id->context;
+ struct ucma_event *con_req_eve;
+ int event_found = 0;
+
+ if (ctx->destroying)
+ return;
+
+ /* only if context is pointing to cm_id that it owns it and can be
+ * queued to be closed, otherwise that cm_id is an inflight one that
+ * is part of that context event list pending to be detached and
+ * reattached to its new context as part of ucma_get_event,
+ * handled separately below.
+ */
+ if (ctx->cm_id == cm_id) {
+ mutex_lock(&mut);
+ ctx->closing = 1;
+ mutex_unlock(&mut);
+ queue_work(ctx->file->close_wq, &ctx->close_work);
+ return;
+ }
+
+ list_for_each_entry(con_req_eve, &ctx->file->event_list, list) {
+ if (con_req_eve->cm_id == cm_id &&
+ con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ list_del(&con_req_eve->list);
+ INIT_WORK(&con_req_eve->close_work, ucma_close_event_id);
+ queue_work(ctx->file->close_wq, &con_req_eve->close_work);
+ event_found = 1;
+ break;
+ }
+ }
+ if (!event_found)
+ printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n");
+}
+
static int ucma_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
@@ -276,14 +351,21 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
* We ignore events for new connections until userspace has set
* their context. This can only happen if an error occurs on a
* new connection before the user accepts it. This is okay,
- * since the accept will just fail later.
+ * since the accept will just fail later. However, we do need
+ * to release the underlying HW resources in case of a device
+ * removal event.
*/
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ucma_removal_event_handler(cm_id);
+
kfree(uevent);
goto out;
}
list_add_tail(&uevent->list, &ctx->file->event_list);
wake_up_interruptible(&ctx->file->poll_wait);
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ucma_removal_event_handler(cm_id);
out:
mutex_unlock(&ctx->file->mut);
return ret;
@@ -391,7 +473,8 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
return -ENOMEM;
ctx->uid = cmd.uid;
- ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type);
+ ctx->cm_id = rdma_create_id(current->nsproxy->net_ns,
+ ucma_event_handler, ctx, cmd.ps, qp_type);
if (IS_ERR(ctx->cm_id)) {
ret = PTR_ERR(ctx->cm_id);
goto err1;
@@ -442,9 +525,15 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
}
/*
- * We cannot hold file->mut when calling rdma_destroy_id() or we can
- * deadlock. We also acquire file->mut in ucma_event_handler(), and
- * rdma_destroy_id() will wait until all callbacks have completed.
+ * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At
+ * this point, no new events will be reported from the hardware. However, we
+ * still need to cleanup the UCMA context for this ID. Specifically, there
+ * might be events that have not yet been consumed by the user space software.
+ * These might include pending connect requests which we have not completed
+ * processing. We cannot call rdma_destroy_id while holding the lock of the
+ * context (file->mut), as it might cause a deadlock. We therefore extract all
+ * relevant events from the context pending events list while holding the
+ * mutex. After that we release them as needed.
*/
static int ucma_free_ctx(struct ucma_context *ctx)
{
@@ -452,8 +541,6 @@ static int ucma_free_ctx(struct ucma_context *ctx)
struct ucma_event *uevent, *tmp;
LIST_HEAD(list);
- /* No new events will be generated after destroying the id. */
- rdma_destroy_id(ctx->cm_id);
ucma_cleanup_multicast(ctx);
@@ -501,10 +588,24 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- ucma_put_ctx(ctx);
- wait_for_completion(&ctx->comp);
- resp.events_reported = ucma_free_ctx(ctx);
+ mutex_lock(&ctx->file->mut);
+ ctx->destroying = 1;
+ mutex_unlock(&ctx->file->mut);
+ flush_workqueue(ctx->file->close_wq);
+ /* At this point it's guaranteed that there is no inflight
+ * closing task */
+ mutex_lock(&mut);
+ if (!ctx->closing) {
+ mutex_unlock(&mut);
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ rdma_destroy_id(ctx->cm_id);
+ } else {
+ mutex_unlock(&mut);
+ }
+
+ resp.events_reported = ucma_free_ctx(ctx);
if (copy_to_user((void __user *)(unsigned long)cmd.response,
&resp, sizeof(resp)))
ret = -EFAULT;
@@ -722,26 +823,13 @@ static ssize_t ucma_query_route(struct ucma_file *file,
resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
resp.port_num = ctx->cm_id->port_num;
- switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
- switch (rdma_port_get_link_layer(ctx->cm_id->device,
- ctx->cm_id->port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
- ucma_copy_ib_route(&resp, &ctx->cm_id->route);
- break;
- case IB_LINK_LAYER_ETHERNET:
- ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
- break;
- default:
- break;
- }
- break;
- case RDMA_TRANSPORT_IWARP:
+
+ if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num))
+ ucma_copy_ib_route(&resp, &ctx->cm_id->route);
+ else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num))
+ ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
+ else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num))
ucma_copy_iw_route(&resp, &ctx->cm_id->route);
- break;
- default:
- break;
- }
out:
if (copy_to_user((void __user *)(unsigned long)cmd.response,
@@ -1125,7 +1213,6 @@ static int ucma_set_ib_path(struct ucma_context *ctx,
return -EINVAL;
memset(&sa_path, 0, sizeof(sa_path));
- sa_path.vlan_id = 0xffff;
ib_sa_unpack_path(path_data->path_rec, &sa_path);
ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
@@ -1334,10 +1421,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
mc = ERR_PTR(-ENOENT);
else if (mc->ctx->file != file)
mc = ERR_PTR(-EINVAL);
- else {
+ else if (!atomic_inc_not_zero(&mc->ctx->ref))
+ mc = ERR_PTR(-ENXIO);
+ else
idr_remove(&multicast_idr, mc->id);
- atomic_inc(&mc->ctx->ref);
- }
mutex_unlock(&mut);
if (IS_ERR(mc)) {
@@ -1367,10 +1454,10 @@ static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
/* Acquire mutex's based on pointer comparison to prevent deadlock. */
if (file1 < file2) {
mutex_lock(&file1->mut);
- mutex_lock(&file2->mut);
+ mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING);
} else {
mutex_lock(&file2->mut);
- mutex_lock(&file1->mut);
+ mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING);
}
}
@@ -1538,6 +1625,12 @@ static int ucma_open(struct inode *inode, struct file *filp)
if (!file)
return -ENOMEM;
+ file->close_wq = create_singlethread_workqueue("ucma_close_id");
+ if (!file->close_wq) {
+ kfree(file);
+ return -ENOMEM;
+ }
+
INIT_LIST_HEAD(&file->event_list);
INIT_LIST_HEAD(&file->ctx_list);
init_waitqueue_head(&file->poll_wait);
@@ -1556,16 +1649,34 @@ static int ucma_close(struct inode *inode, struct file *filp)
mutex_lock(&file->mut);
list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+ ctx->destroying = 1;
mutex_unlock(&file->mut);
mutex_lock(&mut);
idr_remove(&ctx_idr, ctx->id);
mutex_unlock(&mut);
+ flush_workqueue(file->close_wq);
+ /* At that step once ctx was marked as destroying and workqueue
+ * was flushed we are safe from any inflights handlers that
+ * might put other closing task.
+ */
+ mutex_lock(&mut);
+ if (!ctx->closing) {
+ mutex_unlock(&mut);
+ /* rdma_destroy_id ensures that no event handlers are
+ * inflight for that id before releasing it.
+ */
+ rdma_destroy_id(ctx->cm_id);
+ } else {
+ mutex_unlock(&mut);
+ }
+
ucma_free_ctx(ctx);
mutex_lock(&file->mut);
}
mutex_unlock(&file->mut);
+ destroy_workqueue(file->close_wq);
kfree(file);
return 0;
}
@@ -1629,6 +1740,7 @@ static void __exit ucma_cleanup(void)
device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
misc_deregister(&ucma_misc);
idr_destroy(&ctx_idr);
+ idr_destroy(&multicast_idr);
}
module_init(ucma_init);
diff --git a/kernel/drivers/infiniband/core/user_mad.c b/kernel/drivers/infiniband/core/user_mad.c
index 928cdd20e..57f281f8d 100644
--- a/kernel/drivers/infiniband/core/user_mad.c
+++ b/kernel/drivers/infiniband/core/user_mad.c
@@ -99,7 +99,6 @@ struct ib_umad_port {
};
struct ib_umad_device {
- int start_port, end_port;
struct kobject kobj;
struct ib_umad_port port[0];
};
@@ -134,7 +133,7 @@ static DEFINE_SPINLOCK(port_lock);
static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
static void ib_umad_add_one(struct ib_device *device);
-static void ib_umad_remove_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device, void *client_data);
static void ib_umad_release_dev(struct kobject *kobj)
{
@@ -263,20 +262,23 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
{
struct ib_mad_recv_buf *recv_buf;
int left, seg_payload, offset, max_seg_payload;
+ size_t seg_size;
- /* We need enough room to copy the first (or only) MAD segment. */
recv_buf = &packet->recv_wc->recv_buf;
- if ((packet->length <= sizeof (*recv_buf->mad) &&
+ seg_size = packet->recv_wc->mad_seg_size;
+
+ /* We need enough room to copy the first (or only) MAD segment. */
+ if ((packet->length <= seg_size &&
count < hdr_size(file) + packet->length) ||
- (packet->length > sizeof (*recv_buf->mad) &&
- count < hdr_size(file) + sizeof (*recv_buf->mad)))
+ (packet->length > seg_size &&
+ count < hdr_size(file) + seg_size))
return -EINVAL;
if (copy_to_user(buf, &packet->mad, hdr_size(file)))
return -EFAULT;
buf += hdr_size(file);
- seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad));
+ seg_payload = min_t(int, packet->length, seg_size);
if (copy_to_user(buf, recv_buf->mad, seg_payload))
return -EFAULT;
@@ -293,7 +295,7 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
return -ENOSPC;
}
offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
- max_seg_payload = sizeof (struct ib_mad) - offset;
+ max_seg_payload = seg_size - offset;
for (left = packet->length - seg_payload, buf += seg_payload;
left; left -= seg_payload, buf += seg_payload) {
@@ -426,11 +428,11 @@ static int is_duplicate(struct ib_umad_file *file,
* the same TID, reject the second as a duplicate. This is more
* restrictive than required by the spec.
*/
- if (!ib_response_mad((struct ib_mad *) hdr)) {
- if (!ib_response_mad((struct ib_mad *) sent_hdr))
+ if (!ib_response_mad(hdr)) {
+ if (!ib_response_mad(sent_hdr))
return 1;
continue;
- } else if (!ib_response_mad((struct ib_mad *) sent_hdr))
+ } else if (!ib_response_mad(sent_hdr))
continue;
if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr))
@@ -451,6 +453,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
struct ib_rmpp_mad *rmpp_mad;
__be64 *tid;
int ret, data_len, hdr_len, copy_offset, rmpp_active;
+ u8 base_version;
if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
return -EINVAL;
@@ -517,11 +520,13 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
rmpp_active = 0;
}
+ base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version;
data_len = count - hdr_size(file) - hdr_len;
packet->msg = ib_create_send_mad(agent,
be32_to_cpu(packet->mad.hdr.qpn),
packet->mad.hdr.pkey_index, rmpp_active,
- hdr_len, data_len, GFP_KERNEL);
+ hdr_len, data_len, GFP_KERNEL,
+ base_version);
if (IS_ERR(packet->msg)) {
ret = PTR_ERR(packet->msg);
goto err_ah;
@@ -1273,16 +1278,10 @@ static void ib_umad_add_one(struct ib_device *device)
{
struct ib_umad_device *umad_dev;
int s, e, i;
+ int count = 0;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
-
- if (device->node_type == RDMA_NODE_IB_SWITCH)
- s = e = 0;
- else {
- s = 1;
- e = device->phys_port_cnt;
- }
+ s = rdma_start_port(device);
+ e = rdma_end_port(device);
umad_dev = kzalloc(sizeof *umad_dev +
(e - s + 1) * sizeof (struct ib_umad_port),
@@ -1292,38 +1291,49 @@ static void ib_umad_add_one(struct ib_device *device)
kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype);
- umad_dev->start_port = s;
- umad_dev->end_port = e;
-
for (i = s; i <= e; ++i) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
umad_dev->port[i - s].umad_dev = umad_dev;
if (ib_umad_init_port(device, i, umad_dev,
&umad_dev->port[i - s]))
goto err;
+
+ count++;
}
+ if (!count)
+ goto free;
+
ib_set_client_data(device, &umad_client, umad_dev);
return;
err:
- while (--i >= s)
- ib_umad_kill_port(&umad_dev->port[i - s]);
+ while (--i >= s) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+ ib_umad_kill_port(&umad_dev->port[i - s]);
+ }
+free:
kobject_put(&umad_dev->kobj);
}
-static void ib_umad_remove_one(struct ib_device *device)
+static void ib_umad_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+ struct ib_umad_device *umad_dev = client_data;
int i;
if (!umad_dev)
return;
- for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i)
- ib_umad_kill_port(&umad_dev->port[i]);
+ for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) {
+ if (rdma_cap_ib_mad(device, i + rdma_start_port(device)))
+ ib_umad_kill_port(&umad_dev->port[i]);
+ }
kobject_put(&umad_dev->kobj);
}
diff --git a/kernel/drivers/infiniband/core/uverbs.h b/kernel/drivers/infiniband/core/uverbs.h
index bebf11a66..94bbd8c15 100644
--- a/kernel/drivers/infiniband/core/uverbs.h
+++ b/kernel/drivers/infiniband/core/uverbs.h
@@ -89,12 +89,16 @@ struct ib_uverbs_device {
int num_comp_vectors;
struct completion comp;
struct device *dev;
- struct ib_device *ib_dev;
+ struct ib_device __rcu *ib_dev;
int devnum;
struct cdev cdev;
struct rb_root xrcd_tree;
struct mutex xrcd_tree_mutex;
struct kobject kobj;
+ struct srcu_struct disassociate_srcu;
+ struct mutex lists_mutex; /* protect lists */
+ struct list_head uverbs_file_list;
+ struct list_head uverbs_events_file_list;
};
struct ib_uverbs_event_file {
@@ -106,6 +110,7 @@ struct ib_uverbs_event_file {
wait_queue_head_t poll_wait;
struct fasync_struct *async_queue;
struct list_head event_list;
+ struct list_head list;
};
struct ib_uverbs_file {
@@ -115,6 +120,8 @@ struct ib_uverbs_file {
struct ib_ucontext *ucontext;
struct ib_event_handler event_handler;
struct ib_uverbs_event_file *async_file;
+ struct list_head list;
+ int is_closed;
};
struct ib_uverbs_event {
@@ -178,7 +185,9 @@ extern struct idr ib_uverbs_rule_idr;
void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ struct ib_device *ib_dev,
int is_async);
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
@@ -213,6 +222,7 @@ struct ib_uverbs_flow_spec {
#define IB_UVERBS_DECLARE_CMD(name) \
ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
+ struct ib_device *ib_dev, \
const char __user *buf, int in_len, \
int out_len)
@@ -254,11 +264,14 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
#define IB_UVERBS_DECLARE_EX_CMD(name) \
int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \
+ struct ib_device *ib_dev, \
struct ib_udata *ucore, \
struct ib_udata *uhw)
IB_UVERBS_DECLARE_EX_CMD(create_flow);
IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
IB_UVERBS_DECLARE_EX_CMD(query_device);
+IB_UVERBS_DECLARE_EX_CMD(create_cq);
+IB_UVERBS_DECLARE_EX_CMD(create_qp);
#endif /* UVERBS_H */
diff --git a/kernel/drivers/infiniband/core/uverbs_cmd.c b/kernel/drivers/infiniband/core/uverbs_cmd.c
index ccc2494b4..1c02deab0 100644
--- a/kernel/drivers/infiniband/core/uverbs_cmd.c
+++ b/kernel/drivers/infiniband/core/uverbs_cmd.c
@@ -62,9 +62,11 @@ static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
* The ib_uobject locking scheme is as follows:
*
* - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it
- * needs to be held during all idr operations. When an object is
+ * needs to be held during all idr write operations. When an object is
* looked up, a reference must be taken on the object's kref before
- * dropping this lock.
+ * dropping this lock. For read operations, the rcu_read_lock()
+ * and rcu_write_lock() but similarly the kref reference is grabbed
+ * before the rcu_read_unlock().
*
* - Each object also has an rwsem. This rwsem must be held for
* reading while an operation that uses the object is performed.
@@ -96,7 +98,7 @@ static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
static void release_uobj(struct kref *kref)
{
- kfree(container_of(kref, struct ib_uobject, ref));
+ kfree_rcu(container_of(kref, struct ib_uobject, ref), rcu);
}
static void put_uobj(struct ib_uobject *uobj)
@@ -145,7 +147,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
{
struct ib_uobject *uobj;
- spin_lock(&ib_uverbs_idr_lock);
+ rcu_read_lock();
uobj = idr_find(idr, id);
if (uobj) {
if (uobj->context == context)
@@ -153,7 +155,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
else
uobj = NULL;
}
- spin_unlock(&ib_uverbs_idr_lock);
+ rcu_read_unlock();
return uobj;
}
@@ -282,13 +284,13 @@ static void put_xrcd_read(struct ib_uobject *uobj)
}
ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
struct ib_uverbs_get_context cmd;
struct ib_uverbs_get_context_resp resp;
struct ib_udata udata;
- struct ib_device *ibdev = file->device->ib_dev;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_device_attr dev_attr;
#endif
@@ -313,13 +315,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+ ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
if (IS_ERR(ucontext)) {
ret = PTR_ERR(ucontext);
goto err;
}
- ucontext->device = ibdev;
+ ucontext->device = ib_dev;
INIT_LIST_HEAD(&ucontext->pd_list);
INIT_LIST_HEAD(&ucontext->mr_list);
INIT_LIST_HEAD(&ucontext->mw_list);
@@ -340,7 +342,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
ucontext->odp_mrs_count = 0;
INIT_LIST_HEAD(&ucontext->no_private_counters);
- ret = ib_query_device(ibdev, &dev_attr);
+ ret = ib_query_device(ib_dev, &dev_attr);
if (ret)
goto err_free;
if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
@@ -355,7 +357,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err_free;
resp.async_fd = ret;
- filp = ib_uverbs_alloc_event_file(file, 1);
+ filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);
if (IS_ERR(filp)) {
ret = PTR_ERR(filp);
goto err_fd;
@@ -367,16 +369,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err_file;
}
- file->async_file = filp->private_data;
-
- INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
- ib_uverbs_event_handler);
- ret = ib_register_event_handler(&file->event_handler);
- if (ret)
- goto err_file;
-
- kref_get(&file->async_file->ref);
- kref_get(&file->ref);
file->ucontext = ucontext;
fd_install(resp.async_fd, filp);
@@ -386,6 +378,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
return in_len;
err_file:
+ ib_uverbs_free_async_event_file(file);
fput(filp);
err_fd:
@@ -393,7 +386,7 @@ err_fd:
err_free:
put_pid(ucontext->tgid);
- ibdev->dealloc_ucontext(ucontext);
+ ib_dev->dealloc_ucontext(ucontext);
err:
mutex_unlock(&file->mutex);
@@ -401,11 +394,12 @@ err:
}
static void copy_query_dev_fields(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_uverbs_query_device_resp *resp,
struct ib_device_attr *attr)
{
resp->fw_ver = attr->fw_ver;
- resp->node_guid = file->device->ib_dev->node_guid;
+ resp->node_guid = ib_dev->node_guid;
resp->sys_image_guid = attr->sys_image_guid;
resp->max_mr_size = attr->max_mr_size;
resp->page_size_cap = attr->page_size_cap;
@@ -443,10 +437,11 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file,
resp->max_srq_sge = attr->max_srq_sge;
resp->max_pkeys = attr->max_pkeys;
resp->local_ca_ack_delay = attr->local_ca_ack_delay;
- resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt;
+ resp->phys_port_cnt = ib_dev->phys_port_cnt;
}
ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -461,12 +456,12 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- ret = ib_query_device(file->device->ib_dev, &attr);
+ ret = ib_query_device(ib_dev, &attr);
if (ret)
return ret;
memset(&resp, 0, sizeof resp);
- copy_query_dev_fields(file, &resp, &attr);
+ copy_query_dev_fields(file, ib_dev, &resp, &attr);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp))
@@ -476,6 +471,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -490,7 +486,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+ ret = ib_query_port(ib_dev, cmd.port_num, &attr);
if (ret)
return ret;
@@ -515,7 +511,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
resp.active_width = attr.active_width;
resp.active_speed = attr.active_speed;
resp.phys_state = attr.phys_state;
- resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev,
+ resp.link_layer = rdma_port_get_link_layer(ib_dev,
cmd.port_num);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
@@ -526,6 +522,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -553,15 +550,15 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
down_write(&uobj->mutex);
- pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
- file->ucontext, &udata);
+ pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
if (IS_ERR(pd)) {
ret = PTR_ERR(pd);
goto err;
}
- pd->device = file->device->ib_dev;
+ pd->device = ib_dev;
pd->uobject = uobj;
+ pd->local_mr = NULL;
atomic_set(&pd->usecnt, 0);
uobj->object = pd;
@@ -600,11 +597,13 @@ err:
}
ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
struct ib_uverbs_dealloc_pd cmd;
struct ib_uobject *uobj;
+ struct ib_pd *pd;
int ret;
if (copy_from_user(&cmd, buf, sizeof cmd))
@@ -613,15 +612,20 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
if (!uobj)
return -EINVAL;
+ pd = uobj->object;
- ret = ib_dealloc_pd(uobj->object);
- if (!ret)
- uobj->live = 0;
-
- put_uobj_write(uobj);
+ if (atomic_read(&pd->usecnt)) {
+ ret = -EBUSY;
+ goto err_put;
+ }
+ ret = pd->device->dealloc_pd(uobj->object);
+ WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
if (ret)
- return ret;
+ goto err_put;
+
+ uobj->live = 0;
+ put_uobj_write(uobj);
idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
@@ -632,6 +636,10 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
put_uobj(uobj);
return in_len;
+
+err_put:
+ put_uobj_write(uobj);
+ return ret;
}
struct xrcd_table_entry {
@@ -720,6 +728,7 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev,
}
ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -778,15 +787,14 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
down_write(&obj->uobject.mutex);
if (!xrcd) {
- xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
- file->ucontext, &udata);
+ xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata);
if (IS_ERR(xrcd)) {
ret = PTR_ERR(xrcd);
goto err;
}
xrcd->inode = inode;
- xrcd->device = file->device->ib_dev;
+ xrcd->device = ib_dev;
atomic_set(&xrcd->usecnt, 0);
mutex_init(&xrcd->tgt_qp_mutex);
INIT_LIST_HEAD(&xrcd->tgt_qp_list);
@@ -857,6 +865,7 @@ err_tree_mutex_unlock:
}
ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -934,6 +943,7 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
}
ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1043,6 +1053,7 @@ err_free:
}
ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1136,6 +1147,7 @@ put_uobjs:
}
ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1174,8 +1186,9 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_alloc_mw cmd;
struct ib_uverbs_alloc_mw_resp resp;
@@ -1256,8 +1269,9 @@ err_free:
}
ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_dealloc_mw cmd;
struct ib_mw *mw;
@@ -1294,6 +1308,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1313,7 +1328,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
return ret;
resp.fd = ret;
- filp = ib_uverbs_alloc_event_file(file, 0);
+ filp = ib_uverbs_alloc_event_file(file, ib_dev, 0);
if (IS_ERR(filp)) {
put_unused_fd(resp.fd);
return PTR_ERR(filp);
@@ -1330,40 +1345,38 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
return in_len;
}
-ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw,
+ struct ib_uverbs_ex_create_cq *cmd,
+ size_t cmd_sz,
+ int (*cb)(struct ib_uverbs_file *file,
+ struct ib_ucq_object *obj,
+ struct ib_uverbs_ex_create_cq_resp *resp,
+ struct ib_udata *udata,
+ void *context),
+ void *context)
{
- struct ib_uverbs_create_cq cmd;
- struct ib_uverbs_create_cq_resp resp;
- struct ib_udata udata;
struct ib_ucq_object *obj;
struct ib_uverbs_event_file *ev_file = NULL;
struct ib_cq *cq;
int ret;
+ struct ib_uverbs_ex_create_cq_resp resp;
+ struct ib_cq_init_attr attr = {};
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- INIT_UDATA(&udata, buf + sizeof cmd,
- (unsigned long) cmd.response + sizeof resp,
- in_len - sizeof cmd, out_len - sizeof resp);
-
- if (cmd.comp_vector >= file->device->num_comp_vectors)
- return -EINVAL;
+ if (cmd->comp_vector >= file->device->num_comp_vectors)
+ return ERR_PTR(-EINVAL);
obj = kmalloc(sizeof *obj, GFP_KERNEL);
if (!obj)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_class);
+ init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, &cq_lock_class);
down_write(&obj->uobject.mutex);
- if (cmd.comp_channel >= 0) {
- ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+ if (cmd->comp_channel >= 0) {
+ ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel);
if (!ev_file) {
ret = -EINVAL;
goto err;
@@ -1376,15 +1389,20 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
- cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
- cmd.comp_vector,
- file->ucontext, &udata);
+ attr.cqe = cmd->cqe;
+ attr.comp_vector = cmd->comp_vector;
+
+ if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
+ attr.flags = cmd->flags;
+
+ cq = ib_dev->create_cq(ib_dev, &attr,
+ file->ucontext, uhw);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_file;
}
- cq->device = file->device->ib_dev;
+ cq->device = ib_dev;
cq->uobject = &obj->uobject;
cq->comp_handler = ib_uverbs_comp_handler;
cq->event_handler = ib_uverbs_cq_event_handler;
@@ -1397,14 +1415,15 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
goto err_free;
memset(&resp, 0, sizeof resp);
- resp.cq_handle = obj->uobject.id;
- resp.cqe = cq->cqe;
+ resp.base.cq_handle = obj->uobject.id;
+ resp.base.cqe = cq->cqe;
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp)) {
- ret = -EFAULT;
- goto err_copy;
- }
+ resp.response_length = offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length);
+
+ ret = cb(file, obj, &resp, ucore, context);
+ if (ret)
+ goto err_cb;
mutex_lock(&file->mutex);
list_add_tail(&obj->uobject.list, &file->ucontext->cq_list);
@@ -1414,9 +1433,9 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
up_write(&obj->uobject.mutex);
- return in_len;
+ return obj;
-err_copy:
+err_cb:
idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject);
err_free:
@@ -1428,10 +1447,112 @@ err_file:
err:
put_uobj_write(&obj->uobject);
- return ret;
+
+ return ERR_PTR(ret);
+}
+
+static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
+ struct ib_ucq_object *obj,
+ struct ib_uverbs_ex_create_cq_resp *resp,
+ struct ib_udata *ucore, void *context)
+{
+ if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
+ return -EFAULT;
+
+ return 0;
+}
+
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_cq cmd;
+ struct ib_uverbs_ex_create_cq cmd_ex;
+ struct ib_uverbs_create_cq_resp resp;
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+ struct ib_ucq_object *obj;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ INIT_UDATA(&ucore, buf, (unsigned long)cmd.response, sizeof(cmd), sizeof(resp));
+
+ INIT_UDATA(&uhw, buf + sizeof(cmd),
+ (unsigned long)cmd.response + sizeof(resp),
+ in_len - sizeof(cmd), out_len - sizeof(resp));
+
+ memset(&cmd_ex, 0, sizeof(cmd_ex));
+ cmd_ex.user_handle = cmd.user_handle;
+ cmd_ex.cqe = cmd.cqe;
+ cmd_ex.comp_vector = cmd.comp_vector;
+ cmd_ex.comp_channel = cmd.comp_channel;
+
+ obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex,
+ offsetof(typeof(cmd_ex), comp_channel) +
+ sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
+ NULL);
+
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ return in_len;
+}
+
+static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
+ struct ib_ucq_object *obj,
+ struct ib_uverbs_ex_create_cq_resp *resp,
+ struct ib_udata *ucore, void *context)
+{
+ if (ib_copy_to_udata(ucore, resp, resp->response_length))
+ return -EFAULT;
+
+ return 0;
+}
+
+int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_cq_resp resp;
+ struct ib_uverbs_ex_create_cq cmd;
+ struct ib_ucq_object *obj;
+ int err;
+
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ if (ucore->outlen < (offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length)))
+ return -ENOSPC;
+
+ obj = create_cq(file, ib_dev, ucore, uhw, &cmd,
+ min(ucore->inlen, sizeof(cmd)),
+ ib_uverbs_ex_create_cq_cb, NULL);
+
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ return 0;
}
ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1495,6 +1616,7 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
}
ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1546,6 +1668,7 @@ out_put:
}
ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1568,6 +1691,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1619,65 +1743,65 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
return in_len;
}
-ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
-{
- struct ib_uverbs_create_qp cmd;
- struct ib_uverbs_create_qp_resp resp;
- struct ib_udata udata;
- struct ib_uqp_object *obj;
- struct ib_device *device;
- struct ib_pd *pd = NULL;
- struct ib_xrcd *xrcd = NULL;
- struct ib_uobject *uninitialized_var(xrcd_uobj);
- struct ib_cq *scq = NULL, *rcq = NULL;
- struct ib_srq *srq = NULL;
- struct ib_qp *qp;
- struct ib_qp_init_attr attr;
- int ret;
-
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+static int create_qp(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw,
+ struct ib_uverbs_ex_create_qp *cmd,
+ size_t cmd_sz,
+ int (*cb)(struct ib_uverbs_file *file,
+ struct ib_uverbs_ex_create_qp_resp *resp,
+ struct ib_udata *udata),
+ void *context)
+{
+ struct ib_uqp_object *obj;
+ struct ib_device *device;
+ struct ib_pd *pd = NULL;
+ struct ib_xrcd *xrcd = NULL;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_cq *scq = NULL, *rcq = NULL;
+ struct ib_srq *srq = NULL;
+ struct ib_qp *qp;
+ char *buf;
+ struct ib_qp_init_attr attr;
+ struct ib_uverbs_ex_create_qp_resp resp;
+ int ret;
- if (cmd.qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+ if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
return -EPERM;
- INIT_UDATA(&udata, buf + sizeof cmd,
- (unsigned long) cmd.response + sizeof resp,
- in_len - sizeof cmd, out_len - sizeof resp);
-
obj = kzalloc(sizeof *obj, GFP_KERNEL);
if (!obj)
return -ENOMEM;
- init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class);
+ init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext,
+ &qp_lock_class);
down_write(&obj->uevent.uobject.mutex);
- if (cmd.qp_type == IB_QPT_XRC_TGT) {
- xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj);
+ if (cmd->qp_type == IB_QPT_XRC_TGT) {
+ xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext,
+ &xrcd_uobj);
if (!xrcd) {
ret = -EINVAL;
goto err_put;
}
device = xrcd->device;
} else {
- if (cmd.qp_type == IB_QPT_XRC_INI) {
- cmd.max_recv_wr = cmd.max_recv_sge = 0;
+ if (cmd->qp_type == IB_QPT_XRC_INI) {
+ cmd->max_recv_wr = 0;
+ cmd->max_recv_sge = 0;
} else {
- if (cmd.is_srq) {
- srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (cmd->is_srq) {
+ srq = idr_read_srq(cmd->srq_handle,
+ file->ucontext);
if (!srq || srq->srq_type != IB_SRQT_BASIC) {
ret = -EINVAL;
goto err_put;
}
}
- if (cmd.recv_cq_handle != cmd.send_cq_handle) {
- rcq = idr_read_cq(cmd.recv_cq_handle, file->ucontext, 0);
+ if (cmd->recv_cq_handle != cmd->send_cq_handle) {
+ rcq = idr_read_cq(cmd->recv_cq_handle,
+ file->ucontext, 0);
if (!rcq) {
ret = -EINVAL;
goto err_put;
@@ -1685,9 +1809,9 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
}
}
- scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, !!rcq);
+ scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq);
rcq = rcq ?: scq;
- pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ pd = idr_read_pd(cmd->pd_handle, file->ucontext);
if (!pd || !scq) {
ret = -EINVAL;
goto err_put;
@@ -1702,31 +1826,49 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
attr.recv_cq = rcq;
attr.srq = srq;
attr.xrcd = xrcd;
- attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
- attr.qp_type = cmd.qp_type;
+ attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR :
+ IB_SIGNAL_REQ_WR;
+ attr.qp_type = cmd->qp_type;
attr.create_flags = 0;
- attr.cap.max_send_wr = cmd.max_send_wr;
- attr.cap.max_recv_wr = cmd.max_recv_wr;
- attr.cap.max_send_sge = cmd.max_send_sge;
- attr.cap.max_recv_sge = cmd.max_recv_sge;
- attr.cap.max_inline_data = cmd.max_inline_data;
+ attr.cap.max_send_wr = cmd->max_send_wr;
+ attr.cap.max_recv_wr = cmd->max_recv_wr;
+ attr.cap.max_send_sge = cmd->max_send_sge;
+ attr.cap.max_recv_sge = cmd->max_recv_sge;
+ attr.cap.max_inline_data = cmd->max_inline_data;
obj->uevent.events_reported = 0;
INIT_LIST_HEAD(&obj->uevent.event_list);
INIT_LIST_HEAD(&obj->mcast_list);
- if (cmd.qp_type == IB_QPT_XRC_TGT)
+ if (cmd_sz >= offsetof(typeof(*cmd), create_flags) +
+ sizeof(cmd->create_flags))
+ attr.create_flags = cmd->create_flags;
+
+ if (attr.create_flags & ~IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ buf = (void *)cmd + sizeof(*cmd);
+ if (cmd_sz > sizeof(*cmd))
+ if (!(buf[0] == 0 && !memcmp(buf, buf + 1,
+ cmd_sz - sizeof(*cmd) - 1))) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ if (cmd->qp_type == IB_QPT_XRC_TGT)
qp = ib_create_qp(pd, &attr);
else
- qp = device->create_qp(pd, &attr, &udata);
+ qp = device->create_qp(pd, &attr, uhw);
if (IS_ERR(qp)) {
ret = PTR_ERR(qp);
goto err_put;
}
- if (cmd.qp_type != IB_QPT_XRC_TGT) {
+ if (cmd->qp_type != IB_QPT_XRC_TGT) {
qp->real_qp = qp;
qp->device = device;
qp->pd = pd;
@@ -1752,19 +1894,20 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
goto err_destroy;
memset(&resp, 0, sizeof resp);
- resp.qpn = qp->qp_num;
- resp.qp_handle = obj->uevent.uobject.id;
- resp.max_recv_sge = attr.cap.max_recv_sge;
- resp.max_send_sge = attr.cap.max_send_sge;
- resp.max_recv_wr = attr.cap.max_recv_wr;
- resp.max_send_wr = attr.cap.max_send_wr;
- resp.max_inline_data = attr.cap.max_inline_data;
-
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp)) {
- ret = -EFAULT;
- goto err_copy;
- }
+ resp.base.qpn = qp->qp_num;
+ resp.base.qp_handle = obj->uevent.uobject.id;
+ resp.base.max_recv_sge = attr.cap.max_recv_sge;
+ resp.base.max_send_sge = attr.cap.max_send_sge;
+ resp.base.max_recv_wr = attr.cap.max_recv_wr;
+ resp.base.max_send_wr = attr.cap.max_send_wr;
+ resp.base.max_inline_data = attr.cap.max_inline_data;
+
+ resp.response_length = offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length);
+
+ ret = cb(file, &resp, ucore);
+ if (ret)
+ goto err_cb;
if (xrcd) {
obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
@@ -1790,9 +1933,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
up_write(&obj->uevent.uobject.mutex);
- return in_len;
-
-err_copy:
+ return 0;
+err_cb:
idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
err_destroy:
@@ -1814,7 +1956,115 @@ err_put:
return ret;
}
+static int ib_uverbs_create_qp_cb(struct ib_uverbs_file *file,
+ struct ib_uverbs_ex_create_qp_resp *resp,
+ struct ib_udata *ucore)
+{
+ if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
+ return -EFAULT;
+
+ return 0;
+}
+
+ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_qp cmd;
+ struct ib_uverbs_ex_create_qp cmd_ex;
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+ ssize_t resp_size = sizeof(struct ib_uverbs_create_qp_resp);
+ int err;
+
+ if (out_len < resp_size)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ INIT_UDATA(&ucore, buf, (unsigned long)cmd.response, sizeof(cmd),
+ resp_size);
+ INIT_UDATA(&uhw, buf + sizeof(cmd),
+ (unsigned long)cmd.response + resp_size,
+ in_len - sizeof(cmd), out_len - resp_size);
+
+ memset(&cmd_ex, 0, sizeof(cmd_ex));
+ cmd_ex.user_handle = cmd.user_handle;
+ cmd_ex.pd_handle = cmd.pd_handle;
+ cmd_ex.send_cq_handle = cmd.send_cq_handle;
+ cmd_ex.recv_cq_handle = cmd.recv_cq_handle;
+ cmd_ex.srq_handle = cmd.srq_handle;
+ cmd_ex.max_send_wr = cmd.max_send_wr;
+ cmd_ex.max_recv_wr = cmd.max_recv_wr;
+ cmd_ex.max_send_sge = cmd.max_send_sge;
+ cmd_ex.max_recv_sge = cmd.max_recv_sge;
+ cmd_ex.max_inline_data = cmd.max_inline_data;
+ cmd_ex.sq_sig_all = cmd.sq_sig_all;
+ cmd_ex.qp_type = cmd.qp_type;
+ cmd_ex.is_srq = cmd.is_srq;
+
+ err = create_qp(file, &ucore, &uhw, &cmd_ex,
+ offsetof(typeof(cmd_ex), is_srq) +
+ sizeof(cmd.is_srq), ib_uverbs_create_qp_cb,
+ NULL);
+
+ if (err)
+ return err;
+
+ return in_len;
+}
+
+static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file,
+ struct ib_uverbs_ex_create_qp_resp *resp,
+ struct ib_udata *ucore)
+{
+ if (ib_copy_to_udata(ucore, resp, resp->response_length))
+ return -EFAULT;
+
+ return 0;
+}
+
+int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_qp_resp resp;
+ struct ib_uverbs_ex_create_qp cmd = {0};
+ int err;
+
+ if (ucore->inlen < (offsetof(typeof(cmd), comp_mask) +
+ sizeof(cmd.comp_mask)))
+ return -EINVAL;
+
+ err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ if (ucore->outlen < (offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length)))
+ return -ENOSPC;
+
+ err = create_qp(file, ucore, uhw, &cmd,
+ min(ucore->inlen, sizeof(cmd)),
+ ib_uverbs_ex_create_qp_cb, NULL);
+
+ if (err)
+ return err;
+
+ return 0;
+}
+
ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_open_qp cmd;
@@ -1909,6 +2159,7 @@ err_put:
}
ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2023,6 +2274,7 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
}
ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2095,7 +2347,7 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
attr->alt_ah_attr.port_num = cmd.alt_dest.port_num;
if (qp->real_qp == qp) {
- ret = ib_resolve_eth_l2_attrs(qp, attr, &cmd.attr_mask);
+ ret = ib_resolve_eth_dmac(qp, attr, &cmd.attr_mask);
if (ret)
goto release_qp;
ret = qp->device->modify_qp(qp, attr,
@@ -2119,6 +2371,7 @@ out:
}
ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2176,7 +2429,14 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
return in_len;
}
+static void *alloc_wr(size_t wr_size, __u32 num_sge)
+{
+ return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) +
+ num_sge * sizeof (struct ib_sge), GFP_KERNEL);
+};
+
ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2188,6 +2448,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
int i, sg_ind;
int is_ud;
ssize_t ret = -EINVAL;
+ size_t next_size;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
@@ -2223,14 +2484,87 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
goto out_put;
}
- next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
- user_wr->num_sge * sizeof (struct ib_sge),
- GFP_KERNEL);
- if (!next) {
- ret = -ENOMEM;
+ if (is_ud) {
+ struct ib_ud_wr *ud;
+
+ if (user_wr->opcode != IB_WR_SEND &&
+ user_wr->opcode != IB_WR_SEND_WITH_IMM) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ next_size = sizeof(*ud);
+ ud = alloc_wr(next_size, user_wr->num_sge);
+ if (!ud) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ ud->ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext);
+ if (!ud->ah) {
+ kfree(ud);
+ ret = -EINVAL;
+ goto out_put;
+ }
+ ud->remote_qpn = user_wr->wr.ud.remote_qpn;
+ ud->remote_qkey = user_wr->wr.ud.remote_qkey;
+
+ next = &ud->wr;
+ } else if (user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
+ user_wr->opcode == IB_WR_RDMA_WRITE ||
+ user_wr->opcode == IB_WR_RDMA_READ) {
+ struct ib_rdma_wr *rdma;
+
+ next_size = sizeof(*rdma);
+ rdma = alloc_wr(next_size, user_wr->num_sge);
+ if (!rdma) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ rdma->remote_addr = user_wr->wr.rdma.remote_addr;
+ rdma->rkey = user_wr->wr.rdma.rkey;
+
+ next = &rdma->wr;
+ } else if (user_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ struct ib_atomic_wr *atomic;
+
+ next_size = sizeof(*atomic);
+ atomic = alloc_wr(next_size, user_wr->num_sge);
+ if (!atomic) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ atomic->remote_addr = user_wr->wr.atomic.remote_addr;
+ atomic->compare_add = user_wr->wr.atomic.compare_add;
+ atomic->swap = user_wr->wr.atomic.swap;
+ atomic->rkey = user_wr->wr.atomic.rkey;
+
+ next = &atomic->wr;
+ } else if (user_wr->opcode == IB_WR_SEND ||
+ user_wr->opcode == IB_WR_SEND_WITH_IMM ||
+ user_wr->opcode == IB_WR_SEND_WITH_INV) {
+ next_size = sizeof(*next);
+ next = alloc_wr(next_size, user_wr->num_sge);
+ if (!next) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+ } else {
+ ret = -EINVAL;
goto out_put;
}
+ if (user_wr->opcode == IB_WR_SEND_WITH_IMM ||
+ user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+ next->ex.imm_data =
+ (__be32 __force) user_wr->ex.imm_data;
+ } else if (user_wr->opcode == IB_WR_SEND_WITH_INV) {
+ next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey;
+ }
+
if (!last)
wr = next;
else
@@ -2243,63 +2577,9 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
next->opcode = user_wr->opcode;
next->send_flags = user_wr->send_flags;
- if (is_ud) {
- if (next->opcode != IB_WR_SEND &&
- next->opcode != IB_WR_SEND_WITH_IMM) {
- ret = -EINVAL;
- goto out_put;
- }
-
- next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
- file->ucontext);
- if (!next->wr.ud.ah) {
- ret = -EINVAL;
- goto out_put;
- }
- next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn;
- next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
- if (next->opcode == IB_WR_SEND_WITH_IMM)
- next->ex.imm_data =
- (__be32 __force) user_wr->ex.imm_data;
- } else {
- switch (next->opcode) {
- case IB_WR_RDMA_WRITE_WITH_IMM:
- next->ex.imm_data =
- (__be32 __force) user_wr->ex.imm_data;
- case IB_WR_RDMA_WRITE:
- case IB_WR_RDMA_READ:
- next->wr.rdma.remote_addr =
- user_wr->wr.rdma.remote_addr;
- next->wr.rdma.rkey =
- user_wr->wr.rdma.rkey;
- break;
- case IB_WR_SEND_WITH_IMM:
- next->ex.imm_data =
- (__be32 __force) user_wr->ex.imm_data;
- break;
- case IB_WR_SEND_WITH_INV:
- next->ex.invalidate_rkey =
- user_wr->ex.invalidate_rkey;
- break;
- case IB_WR_ATOMIC_CMP_AND_SWP:
- case IB_WR_ATOMIC_FETCH_AND_ADD:
- next->wr.atomic.remote_addr =
- user_wr->wr.atomic.remote_addr;
- next->wr.atomic.compare_add =
- user_wr->wr.atomic.compare_add;
- next->wr.atomic.swap = user_wr->wr.atomic.swap;
- next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
- case IB_WR_SEND:
- break;
- default:
- ret = -EINVAL;
- goto out_put;
- }
- }
-
if (next->num_sge) {
next->sg_list = (void *) next +
- ALIGN(sizeof *next, sizeof (struct ib_sge));
+ ALIGN(next_size, sizeof(struct ib_sge));
if (copy_from_user(next->sg_list,
buf + sizeof cmd +
cmd.wr_count * cmd.wqe_size +
@@ -2330,8 +2610,8 @@ out_put:
put_qp_read(qp);
while (wr) {
- if (is_ud && wr->wr.ud.ah)
- put_ah_read(wr->wr.ud.ah);
+ if (is_ud && ud_wr(wr)->ah)
+ put_ah_read(ud_wr(wr)->ah);
next = wr->next;
kfree(wr);
wr = next;
@@ -2429,6 +2709,7 @@ err:
}
ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2478,6 +2759,7 @@ out:
}
ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2527,6 +2809,7 @@ out:
}
ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2567,7 +2850,6 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
attr.grh.sgid_index = cmd.attr.grh.sgid_index;
attr.grh.hop_limit = cmd.attr.grh.hop_limit;
attr.grh.traffic_class = cmd.attr.grh.traffic_class;
- attr.vlan_id = 0;
memset(&attr.dmac, 0, sizeof(attr.dmac));
memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
@@ -2619,6 +2901,7 @@ err:
}
ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_destroy_ah cmd;
@@ -2655,6 +2938,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2702,6 +2986,7 @@ out_put:
}
ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2782,6 +3067,7 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
}
int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
@@ -2942,6 +3228,7 @@ err_free_attr:
}
int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
@@ -2984,6 +3271,7 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
}
static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_uverbs_create_xsrq *cmd,
struct ib_udata *udata)
{
@@ -3117,6 +3405,7 @@ err:
}
ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -3144,7 +3433,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+ ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata);
if (ret)
return ret;
@@ -3152,6 +3441,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_create_xsrq cmd;
@@ -3169,7 +3459,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- ret = __uverbs_create_xsrq(file, &cmd, &udata);
+ ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata);
if (ret)
return ret;
@@ -3177,6 +3467,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -3207,6 +3498,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -3247,6 +3539,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -3304,16 +3597,15 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
}
int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
struct ib_uverbs_ex_query_device_resp resp;
struct ib_uverbs_ex_query_device cmd;
struct ib_device_attr attr;
- struct ib_device *device;
int err;
- device = file->device->ib_dev;
if (ucore->inlen < sizeof(cmd))
return -EINVAL;
@@ -3332,11 +3624,13 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
if (ucore->outlen < resp.response_length)
return -ENOSPC;
- err = device->query_device(device, &attr);
+ memset(&attr, 0, sizeof(attr));
+
+ err = ib_dev->query_device(ib_dev, &attr, uhw);
if (err)
return err;
- copy_query_dev_fields(file, &resp.base, &attr);
+ copy_query_dev_fields(file, ib_dev, &resp.base, &attr);
resp.comp_mask = 0;
if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
@@ -3356,6 +3650,18 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
#endif
resp.response_length += sizeof(resp.odp_caps);
+ if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask))
+ goto end;
+
+ resp.timestamp_mask = attr.timestamp_mask;
+ resp.response_length += sizeof(resp.timestamp_mask);
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock))
+ goto end;
+
+ resp.hca_core_clock = attr.hca_core_clock;
+ resp.response_length += sizeof(resp.hca_core_clock);
+
end:
err = ib_copy_to_udata(ucore, &resp, resp.response_length);
if (err)
diff --git a/kernel/drivers/infiniband/core/uverbs_main.c b/kernel/drivers/infiniband/core/uverbs_main.c
index 09686d49d..e3ef28861 100644
--- a/kernel/drivers/infiniband/core/uverbs_main.c
+++ b/kernel/drivers/infiniband/core/uverbs_main.c
@@ -79,6 +79,7 @@ static DEFINE_SPINLOCK(map_lock);
static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len) = {
[IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
@@ -119,21 +120,25 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
};
static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw) = {
[IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
[IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow,
[IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device,
+ [IB_USER_VERBS_EX_CMD_CREATE_CQ] = ib_uverbs_ex_create_cq,
+ [IB_USER_VERBS_EX_CMD_CREATE_QP] = ib_uverbs_ex_create_qp,
};
static void ib_uverbs_add_one(struct ib_device *device);
-static void ib_uverbs_remove_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
static void ib_uverbs_release_dev(struct kobject *kobj)
{
struct ib_uverbs_device *dev =
container_of(kobj, struct ib_uverbs_device, kobj);
+ cleanup_srcu_struct(&dev->disassociate_srcu);
kfree(dev);
}
@@ -204,9 +209,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
{
struct ib_uobject *uobj, *tmp;
- if (!context)
- return 0;
-
context->closing = 1;
list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
@@ -315,8 +317,16 @@ static void ib_uverbs_release_file(struct kref *ref)
{
struct ib_uverbs_file *file =
container_of(ref, struct ib_uverbs_file, ref);
+ struct ib_device *ib_dev;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (ib_dev && !ib_dev->disassociate_ucontext)
+ module_put(ib_dev->owner);
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
- module_put(file->device->ib_dev->owner);
if (atomic_dec_and_test(&file->device->refcount))
ib_uverbs_comp_dev(file->device);
@@ -340,9 +350,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
return -EAGAIN;
if (wait_event_interruptible(file->poll_wait,
- !list_empty(&file->event_list)))
+ (!list_empty(&file->event_list) ||
+ /* The barriers built into wait_event_interruptible()
+ * and wake_up() guarentee this will see the null set
+ * without using RCU
+ */
+ !file->uverbs_file->device->ib_dev)))
return -ERESTARTSYS;
+ /* If device was disassociated and no event exists set an error */
+ if (list_empty(&file->event_list) &&
+ !file->uverbs_file->device->ib_dev)
+ return -EIO;
+
spin_lock_irq(&file->lock);
}
@@ -405,8 +425,11 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
{
struct ib_uverbs_event_file *file = filp->private_data;
struct ib_uverbs_event *entry, *tmp;
+ int closed_already = 0;
+ mutex_lock(&file->uverbs_file->device->lists_mutex);
spin_lock_irq(&file->lock);
+ closed_already = file->is_closed;
file->is_closed = 1;
list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
if (entry->counter)
@@ -414,11 +437,15 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
kfree(entry);
}
spin_unlock_irq(&file->lock);
-
- if (file->is_async) {
- ib_unregister_event_handler(&file->uverbs_file->event_handler);
- kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+ if (!closed_already) {
+ list_del(&file->list);
+ if (file->is_async)
+ ib_unregister_event_handler(&file->uverbs_file->
+ event_handler);
}
+ mutex_unlock(&file->uverbs_file->device->lists_mutex);
+
+ kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
kref_put(&file->ref, ib_uverbs_release_event_file);
return 0;
@@ -550,13 +577,21 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
NULL, NULL);
}
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
+{
+ kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+ file->async_file = NULL;
+}
+
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ struct ib_device *ib_dev,
int is_async)
{
struct ib_uverbs_event_file *ev_file;
struct file *filp;
+ int ret;
- ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+ ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
if (!ev_file)
return ERR_PTR(-ENOMEM);
@@ -565,16 +600,47 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
INIT_LIST_HEAD(&ev_file->event_list);
init_waitqueue_head(&ev_file->poll_wait);
ev_file->uverbs_file = uverbs_file;
+ kref_get(&ev_file->uverbs_file->ref);
ev_file->async_queue = NULL;
- ev_file->is_async = is_async;
ev_file->is_closed = 0;
filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
ev_file, O_RDONLY);
if (IS_ERR(filp))
- kfree(ev_file);
+ goto err_put_refs;
+
+ mutex_lock(&uverbs_file->device->lists_mutex);
+ list_add_tail(&ev_file->list,
+ &uverbs_file->device->uverbs_events_file_list);
+ mutex_unlock(&uverbs_file->device->lists_mutex);
+
+ if (is_async) {
+ WARN_ON(uverbs_file->async_file);
+ uverbs_file->async_file = ev_file;
+ kref_get(&uverbs_file->async_file->ref);
+ INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
+ ib_dev,
+ ib_uverbs_event_handler);
+ ret = ib_register_event_handler(&uverbs_file->event_handler);
+ if (ret)
+ goto err_put_file;
+
+ /* At that point async file stuff was fully set */
+ ev_file->is_async = 1;
+ }
return filp;
+
+err_put_file:
+ fput(filp);
+ kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file);
+ uverbs_file->async_file = NULL;
+ return ERR_PTR(ret);
+
+err_put_refs:
+ kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
+ kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+ return filp;
}
/*
@@ -610,8 +676,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
size_t count, loff_t *pos)
{
struct ib_uverbs_file *file = filp->private_data;
+ struct ib_device *ib_dev;
struct ib_uverbs_cmd_hdr hdr;
__u32 flags;
+ int srcu_key;
+ ssize_t ret;
if (count < sizeof hdr)
return -EINVAL;
@@ -619,6 +688,14 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
if (copy_from_user(&hdr, buf, sizeof hdr))
return -EFAULT;
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto out;
+ }
+
flags = (hdr.command &
IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
@@ -626,26 +703,36 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
__u32 command;
if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
- IB_USER_VERBS_CMD_COMMAND_MASK))
- return -EINVAL;
+ IB_USER_VERBS_CMD_COMMAND_MASK)) {
+ ret = -EINVAL;
+ goto out;
+ }
command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
- !uverbs_cmd_table[command])
- return -EINVAL;
+ !uverbs_cmd_table[command]) {
+ ret = -EINVAL;
+ goto out;
+ }
if (!file->ucontext &&
- command != IB_USER_VERBS_CMD_GET_CONTEXT)
- return -EINVAL;
+ command != IB_USER_VERBS_CMD_GET_CONTEXT) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
- return -ENOSYS;
+ if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) {
+ ret = -ENOSYS;
+ goto out;
+ }
- if (hdr.in_words * 4 != count)
- return -EINVAL;
+ if (hdr.in_words * 4 != count) {
+ ret = -EINVAL;
+ goto out;
+ }
- return uverbs_cmd_table[command](file,
+ ret = uverbs_cmd_table[command](file, ib_dev,
buf + sizeof(hdr),
hdr.in_words * 4,
hdr.out_words * 4);
@@ -656,51 +743,72 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
struct ib_uverbs_ex_cmd_hdr ex_hdr;
struct ib_udata ucore;
struct ib_udata uhw;
- int err;
size_t written_count = count;
if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
- IB_USER_VERBS_CMD_COMMAND_MASK))
- return -EINVAL;
+ IB_USER_VERBS_CMD_COMMAND_MASK)) {
+ ret = -EINVAL;
+ goto out;
+ }
command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
- !uverbs_ex_cmd_table[command])
- return -ENOSYS;
+ !uverbs_ex_cmd_table[command]) {
+ ret = -ENOSYS;
+ goto out;
+ }
- if (!file->ucontext)
- return -EINVAL;
+ if (!file->ucontext) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
- return -ENOSYS;
+ if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) {
+ ret = -ENOSYS;
+ goto out;
+ }
- if (count < (sizeof(hdr) + sizeof(ex_hdr)))
- return -EINVAL;
+ if (count < (sizeof(hdr) + sizeof(ex_hdr))) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
- return -EFAULT;
+ if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) {
+ ret = -EFAULT;
+ goto out;
+ }
count -= sizeof(hdr) + sizeof(ex_hdr);
buf += sizeof(hdr) + sizeof(ex_hdr);
- if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
- return -EINVAL;
+ if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (ex_hdr.cmd_hdr_reserved)
- return -EINVAL;
+ if (ex_hdr.cmd_hdr_reserved) {
+ ret = -EINVAL;
+ goto out;
+ }
if (ex_hdr.response) {
- if (!hdr.out_words && !ex_hdr.provider_out_words)
- return -EINVAL;
+ if (!hdr.out_words && !ex_hdr.provider_out_words) {
+ ret = -EINVAL;
+ goto out;
+ }
if (!access_ok(VERIFY_WRITE,
(void __user *) (unsigned long) ex_hdr.response,
- (hdr.out_words + ex_hdr.provider_out_words) * 8))
- return -EFAULT;
+ (hdr.out_words + ex_hdr.provider_out_words) * 8)) {
+ ret = -EFAULT;
+ goto out;
+ }
} else {
- if (hdr.out_words || ex_hdr.provider_out_words)
- return -EINVAL;
+ if (hdr.out_words || ex_hdr.provider_out_words) {
+ ret = -EINVAL;
+ goto out;
+ }
}
INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
@@ -712,27 +820,43 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
ex_hdr.provider_in_words * 8,
ex_hdr.provider_out_words * 8);
- err = uverbs_ex_cmd_table[command](file,
+ ret = uverbs_ex_cmd_table[command](file,
+ ib_dev,
&ucore,
&uhw);
-
- if (err)
- return err;
-
- return written_count;
+ if (!ret)
+ ret = written_count;
+ } else {
+ ret = -ENOSYS;
}
- return -ENOSYS;
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return ret;
}
static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct ib_uverbs_file *file = filp->private_data;
+ struct ib_device *ib_dev;
+ int ret = 0;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto out;
+ }
if (!file->ucontext)
- return -ENODEV;
+ ret = -ENODEV;
else
- return file->device->ib_dev->mmap(file->ucontext, vma);
+ ret = ib_dev->mmap(file->ucontext, vma);
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return ret;
}
/*
@@ -749,21 +873,43 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
{
struct ib_uverbs_device *dev;
struct ib_uverbs_file *file;
+ struct ib_device *ib_dev;
int ret;
+ int module_dependent;
+ int srcu_key;
dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
if (!atomic_inc_not_zero(&dev->refcount))
return -ENXIO;
- if (!try_module_get(dev->ib_dev->owner)) {
- ret = -ENODEV;
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ mutex_lock(&dev->lists_mutex);
+ ib_dev = srcu_dereference(dev->ib_dev,
+ &dev->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
goto err;
}
- file = kmalloc(sizeof *file, GFP_KERNEL);
+ /* In case IB device supports disassociate ucontext, there is no hard
+ * dependency between uverbs device and its low level device.
+ */
+ module_dependent = !(ib_dev->disassociate_ucontext);
+
+ if (module_dependent) {
+ if (!try_module_get(ib_dev->owner)) {
+ ret = -ENODEV;
+ goto err;
+ }
+ }
+
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
if (!file) {
ret = -ENOMEM;
- goto err_module;
+ if (module_dependent)
+ goto err_module;
+
+ goto err;
}
file->device = dev;
@@ -774,13 +920,18 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
filp->private_data = file;
kobject_get(&dev->kobj);
+ list_add_tail(&file->list, &dev->uverbs_file_list);
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
return nonseekable_open(inode, filp);
err_module:
- module_put(dev->ib_dev->owner);
+ module_put(ib_dev->owner);
err:
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
if (atomic_dec_and_test(&dev->refcount))
ib_uverbs_comp_dev(dev);
@@ -791,8 +942,18 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
{
struct ib_uverbs_file *file = filp->private_data;
struct ib_uverbs_device *dev = file->device;
-
- ib_uverbs_cleanup_ucontext(file, file->ucontext);
+ struct ib_ucontext *ucontext = NULL;
+
+ mutex_lock(&file->device->lists_mutex);
+ ucontext = file->ucontext;
+ file->ucontext = NULL;
+ if (!file->is_closed) {
+ list_del(&file->list);
+ file->is_closed = 1;
+ }
+ mutex_unlock(&file->device->lists_mutex);
+ if (ucontext)
+ ib_uverbs_cleanup_ucontext(file, ucontext);
if (file->async_file)
kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
@@ -829,12 +990,21 @@ static struct ib_client uverbs_client = {
static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
char *buf)
{
+ int ret = -ENODEV;
+ int srcu_key;
struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ struct ib_device *ib_dev;
if (!dev)
return -ENODEV;
- return sprintf(buf, "%s\n", dev->ib_dev->name);
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sprintf(buf, "%s\n", ib_dev->name);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+ return ret;
}
static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
@@ -842,11 +1012,19 @@ static ssize_t show_dev_abi_version(struct device *device,
struct device_attribute *attr, char *buf)
{
struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ int ret = -ENODEV;
+ int srcu_key;
+ struct ib_device *ib_dev;
if (!dev)
return -ENODEV;
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
- return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+ return ret;
}
static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
@@ -886,6 +1064,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
int devnum;
dev_t base;
struct ib_uverbs_device *uverbs_dev;
+ int ret;
if (!device->alloc_ucontext)
return;
@@ -894,11 +1073,20 @@ static void ib_uverbs_add_one(struct ib_device *device)
if (!uverbs_dev)
return;
+ ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
+ if (ret) {
+ kfree(uverbs_dev);
+ return;
+ }
+
atomic_set(&uverbs_dev->refcount, 1);
init_completion(&uverbs_dev->comp);
uverbs_dev->xrcd_tree = RB_ROOT;
mutex_init(&uverbs_dev->xrcd_tree_mutex);
kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
+ mutex_init(&uverbs_dev->lists_mutex);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
spin_lock(&map_lock);
devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -919,7 +1107,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
}
spin_unlock(&map_lock);
- uverbs_dev->ib_dev = device;
+ rcu_assign_pointer(uverbs_dev->ib_dev, device);
uverbs_dev->num_comp_vectors = device->num_comp_vectors;
cdev_init(&uverbs_dev->cdev, NULL);
@@ -963,9 +1151,72 @@ err:
return;
}
-static void ib_uverbs_remove_one(struct ib_device *device)
+static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
+ struct ib_device *ib_dev)
{
- struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+ struct ib_uverbs_file *file;
+ struct ib_uverbs_event_file *event_file;
+ struct ib_event event;
+
+ /* Pending running commands to terminate */
+ synchronize_srcu(&uverbs_dev->disassociate_srcu);
+ event.event = IB_EVENT_DEVICE_FATAL;
+ event.element.port_num = 0;
+ event.device = ib_dev;
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ while (!list_empty(&uverbs_dev->uverbs_file_list)) {
+ struct ib_ucontext *ucontext;
+
+ file = list_first_entry(&uverbs_dev->uverbs_file_list,
+ struct ib_uverbs_file, list);
+ file->is_closed = 1;
+ ucontext = file->ucontext;
+ list_del(&file->list);
+ file->ucontext = NULL;
+ kref_get(&file->ref);
+ mutex_unlock(&uverbs_dev->lists_mutex);
+ /* We must release the mutex before going ahead and calling
+ * disassociate_ucontext. disassociate_ucontext might end up
+ * indirectly calling uverbs_close, for example due to freeing
+ * the resources (e.g mmput).
+ */
+ ib_uverbs_event_handler(&file->event_handler, &event);
+ if (ucontext) {
+ ib_dev->disassociate_ucontext(ucontext);
+ ib_uverbs_cleanup_ucontext(file, ucontext);
+ }
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ kref_put(&file->ref, ib_uverbs_release_file);
+ }
+
+ while (!list_empty(&uverbs_dev->uverbs_events_file_list)) {
+ event_file = list_first_entry(&uverbs_dev->
+ uverbs_events_file_list,
+ struct ib_uverbs_event_file,
+ list);
+ spin_lock_irq(&event_file->lock);
+ event_file->is_closed = 1;
+ spin_unlock_irq(&event_file->lock);
+
+ list_del(&event_file->list);
+ if (event_file->is_async) {
+ ib_unregister_event_handler(&event_file->uverbs_file->
+ event_handler);
+ event_file->uverbs_file->event_handler.device = NULL;
+ }
+
+ wake_up_interruptible(&event_file->poll_wait);
+ kill_fasync(&event_file->async_queue, SIGIO, POLL_IN);
+ }
+ mutex_unlock(&uverbs_dev->lists_mutex);
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
+{
+ struct ib_uverbs_device *uverbs_dev = client_data;
+ int wait_clients = 1;
if (!uverbs_dev)
return;
@@ -979,9 +1230,27 @@ static void ib_uverbs_remove_one(struct ib_device *device)
else
clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
+ if (device->disassociate_ucontext) {
+ /* We disassociate HW resources and immediately return.
+ * Userspace will see a EIO errno for all future access.
+ * Upon returning, ib_device may be freed internally and is not
+ * valid any more.
+ * uverbs_device is still available until all clients close
+ * their files, then the uverbs device ref count will be zero
+ * and its resources will be freed.
+ * Note: At this point no more files can be opened since the
+ * cdev was deleted, however active clients can still issue
+ * commands and close their open files.
+ */
+ rcu_assign_pointer(uverbs_dev->ib_dev, NULL);
+ ib_uverbs_free_hw_resources(uverbs_dev, device);
+ wait_clients = 0;
+ }
+
if (atomic_dec_and_test(&uverbs_dev->refcount))
ib_uverbs_comp_dev(uverbs_dev);
- wait_for_completion(&uverbs_dev->comp);
+ if (wait_clients)
+ wait_for_completion(&uverbs_dev->comp);
kobject_put(&uverbs_dev->kobj);
}
diff --git a/kernel/drivers/infiniband/core/uverbs_marshall.c b/kernel/drivers/infiniband/core/uverbs_marshall.c
index abd972474..7d2f14c9b 100644
--- a/kernel/drivers/infiniband/core/uverbs_marshall.c
+++ b/kernel/drivers/infiniband/core/uverbs_marshall.c
@@ -141,8 +141,8 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
dst->preference = src->preference;
dst->packet_life_time_selector = src->packet_life_time_selector;
- memset(dst->smac, 0, sizeof(dst->smac));
memset(dst->dmac, 0, sizeof(dst->dmac));
- dst->vlan_id = 0xffff;
+ dst->net = NULL;
+ dst->ifindex = 0;
}
EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/kernel/drivers/infiniband/core/verbs.c b/kernel/drivers/infiniband/core/verbs.c
index f93eb8da7..545906dec 100644
--- a/kernel/drivers/infiniband/core/verbs.c
+++ b/kernel/drivers/infiniband/core/verbs.c
@@ -41,6 +41,9 @@
#include <linux/export.h>
#include <linux/string.h>
#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/addrconf.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>
@@ -48,6 +51,71 @@
#include "core_priv.h"
+static const char * const ib_events[] = {
+ [IB_EVENT_CQ_ERR] = "CQ error",
+ [IB_EVENT_QP_FATAL] = "QP fatal error",
+ [IB_EVENT_QP_REQ_ERR] = "QP request error",
+ [IB_EVENT_QP_ACCESS_ERR] = "QP access error",
+ [IB_EVENT_COMM_EST] = "communication established",
+ [IB_EVENT_SQ_DRAINED] = "send queue drained",
+ [IB_EVENT_PATH_MIG] = "path migration successful",
+ [IB_EVENT_PATH_MIG_ERR] = "path migration error",
+ [IB_EVENT_DEVICE_FATAL] = "device fatal error",
+ [IB_EVENT_PORT_ACTIVE] = "port active",
+ [IB_EVENT_PORT_ERR] = "port error",
+ [IB_EVENT_LID_CHANGE] = "LID change",
+ [IB_EVENT_PKEY_CHANGE] = "P_key change",
+ [IB_EVENT_SM_CHANGE] = "SM change",
+ [IB_EVENT_SRQ_ERR] = "SRQ error",
+ [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached",
+ [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached",
+ [IB_EVENT_CLIENT_REREGISTER] = "client reregister",
+ [IB_EVENT_GID_CHANGE] = "GID changed",
+};
+
+const char *__attribute_const__ ib_event_msg(enum ib_event_type event)
+{
+ size_t index = event;
+
+ return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ?
+ ib_events[index] : "unrecognized event";
+}
+EXPORT_SYMBOL(ib_event_msg);
+
+static const char * const wc_statuses[] = {
+ [IB_WC_SUCCESS] = "success",
+ [IB_WC_LOC_LEN_ERR] = "local length error",
+ [IB_WC_LOC_QP_OP_ERR] = "local QP operation error",
+ [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error",
+ [IB_WC_LOC_PROT_ERR] = "local protection error",
+ [IB_WC_WR_FLUSH_ERR] = "WR flushed",
+ [IB_WC_MW_BIND_ERR] = "memory management operation error",
+ [IB_WC_BAD_RESP_ERR] = "bad response error",
+ [IB_WC_LOC_ACCESS_ERR] = "local access error",
+ [IB_WC_REM_INV_REQ_ERR] = "invalid request error",
+ [IB_WC_REM_ACCESS_ERR] = "remote access error",
+ [IB_WC_REM_OP_ERR] = "remote operation error",
+ [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded",
+ [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded",
+ [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error",
+ [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request",
+ [IB_WC_REM_ABORT_ERR] = "operation aborted",
+ [IB_WC_INV_EECN_ERR] = "invalid EE context number",
+ [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state",
+ [IB_WC_FATAL_ERR] = "fatal error",
+ [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error",
+ [IB_WC_GENERAL_ERR] = "general error",
+};
+
+const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status)
+{
+ size_t index = status;
+
+ return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ?
+ wc_statuses[index] : "unrecognized status";
+}
+EXPORT_SYMBOL(ib_wc_status_msg);
+
__attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
{
switch (rate) {
@@ -148,28 +216,79 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
/* Protection domains */
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ *
+ * Every PD has a local_dma_lkey which can be used as the lkey value for local
+ * memory operations.
+ */
struct ib_pd *ib_alloc_pd(struct ib_device *device)
{
struct ib_pd *pd;
+ struct ib_device_attr devattr;
+ int rc;
+
+ rc = ib_query_device(device, &devattr);
+ if (rc)
+ return ERR_PTR(rc);
pd = device->alloc_pd(device, NULL, NULL);
+ if (IS_ERR(pd))
+ return pd;
+
+ pd->device = device;
+ pd->uobject = NULL;
+ pd->local_mr = NULL;
+ atomic_set(&pd->usecnt, 0);
+
+ if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ pd->local_dma_lkey = device->local_dma_lkey;
+ else {
+ struct ib_mr *mr;
+
+ mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(mr)) {
+ ib_dealloc_pd(pd);
+ return (struct ib_pd *)mr;
+ }
- if (!IS_ERR(pd)) {
- pd->device = device;
- pd->uobject = NULL;
- atomic_set(&pd->usecnt, 0);
+ pd->local_mr = mr;
+ pd->local_dma_lkey = pd->local_mr->lkey;
}
-
return pd;
}
EXPORT_SYMBOL(ib_alloc_pd);
-int ib_dealloc_pd(struct ib_pd *pd)
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ *
+ * It is an error to call this function while any resources in the pd still
+ * exist. The caller is responsible to synchronously destroy them and
+ * guarantee no new allocations will happen.
+ */
+void ib_dealloc_pd(struct ib_pd *pd)
{
- if (atomic_read(&pd->usecnt))
- return -EBUSY;
+ int ret;
- return pd->device->dealloc_pd(pd);
+ if (pd->local_mr) {
+ ret = ib_dereg_mr(pd->local_mr);
+ WARN_ON(ret);
+ pd->local_mr = NULL;
+ }
+
+ /* uverbs manipulates usecnt with proper locking, while the kabi
+ requires the caller to guarantee we can't race here. */
+ WARN_ON(atomic_read(&pd->usecnt));
+
+ /* Making delalloc_pd a void return is a WIP, no driver should return
+ an error here. */
+ ret = pd->device->dealloc_pd(pd);
+ WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
}
EXPORT_SYMBOL(ib_dealloc_pd);
@@ -192,32 +311,69 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
}
EXPORT_SYMBOL(ib_create_ah);
-int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
- struct ib_grh *grh, struct ib_ah_attr *ah_attr)
+struct find_gid_index_context {
+ u16 vlan_id;
+};
+
+static bool find_gid_index(const union ib_gid *gid,
+ const struct ib_gid_attr *gid_attr,
+ void *context)
+{
+ struct find_gid_index_context *ctx =
+ (struct find_gid_index_context *)context;
+
+ if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
+ (is_vlan_dev(gid_attr->ndev) &&
+ vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
+ return false;
+
+ return true;
+}
+
+static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num,
+ u16 vlan_id, const union ib_gid *sgid,
+ u16 *gid_index)
+{
+ struct find_gid_index_context context = {.vlan_id = vlan_id};
+
+ return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index,
+ &context, gid_index);
+}
+
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
+ const struct ib_wc *wc, const struct ib_grh *grh,
+ struct ib_ah_attr *ah_attr)
{
u32 flow_class;
u16 gid_index;
int ret;
- int is_eth = (rdma_port_get_link_layer(device, port_num) ==
- IB_LINK_LAYER_ETHERNET);
memset(ah_attr, 0, sizeof *ah_attr);
- if (is_eth) {
+ if (rdma_cap_eth_ah(device, port_num)) {
+ u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
+ wc->vlan_id : 0xffff;
+
if (!(wc->wc_flags & IB_WC_GRH))
return -EPROTOTYPE;
- if (wc->wc_flags & IB_WC_WITH_SMAC &&
- wc->wc_flags & IB_WC_WITH_VLAN) {
- memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
- ah_attr->vlan_id = wc->vlan_id;
- } else {
+ if (!(wc->wc_flags & IB_WC_WITH_SMAC) ||
+ !(wc->wc_flags & IB_WC_WITH_VLAN)) {
ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
- ah_attr->dmac, &ah_attr->vlan_id);
+ ah_attr->dmac,
+ wc->wc_flags & IB_WC_WITH_VLAN ?
+ NULL : &vlan_id,
+ 0);
if (ret)
return ret;
}
- } else {
- ah_attr->vlan_id = 0xffff;
+
+ ret = get_sgid_index_from_eth(device, port_num, vlan_id,
+ &grh->dgid, &gid_index);
+ if (ret)
+ return ret;
+
+ if (wc->wc_flags & IB_WC_WITH_SMAC)
+ memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
}
ah_attr->dlid = wc->slid;
@@ -229,10 +385,13 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
ah_attr->ah_flags = IB_AH_GRH;
ah_attr->grh.dgid = grh->sgid;
- ret = ib_find_cached_gid(device, &grh->dgid, &port_num,
- &gid_index);
- if (ret)
- return ret;
+ if (!rdma_cap_eth_ah(device, port_num)) {
+ ret = ib_find_cached_gid_by_port(device, &grh->dgid,
+ port_num, NULL,
+ &gid_index);
+ if (ret)
+ return ret;
+ }
ah_attr->grh.sgid_index = (u8) gid_index;
flow_class = be32_to_cpu(grh->version_tclass_flow);
@@ -244,8 +403,8 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
}
EXPORT_SYMBOL(ib_init_ah_from_wc);
-struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
- struct ib_grh *grh, u8 port_num)
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
+ const struct ib_grh *grh, u8 port_num)
{
struct ib_ah_attr ah_attr;
int ret;
@@ -502,9 +661,7 @@ EXPORT_SYMBOL(ib_create_qp);
static const struct {
int valid;
enum ib_qp_attr_mask req_param[IB_QPT_MAX];
- enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX];
enum ib_qp_attr_mask opt_param[IB_QPT_MAX];
- enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX];
} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
[IB_QPS_RESET] = {
[IB_QPS_RESET] = { .valid = 1 },
@@ -585,12 +742,6 @@ static const struct {
IB_QP_MAX_DEST_RD_ATOMIC |
IB_QP_MIN_RNR_TIMER),
},
- .req_param_add_eth = {
- [IB_QPT_RC] = (IB_QP_SMAC),
- [IB_QPT_UC] = (IB_QP_SMAC),
- [IB_QPT_XRC_INI] = (IB_QP_SMAC),
- [IB_QPT_XRC_TGT] = (IB_QP_SMAC)
- },
.opt_param = {
[IB_QPT_UD] = (IB_QP_PKEY_INDEX |
IB_QP_QKEY),
@@ -611,21 +762,7 @@ static const struct {
[IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
IB_QP_QKEY),
},
- .opt_param_add_eth = {
- [IB_QPT_RC] = (IB_QP_ALT_SMAC |
- IB_QP_VID |
- IB_QP_ALT_VID),
- [IB_QPT_UC] = (IB_QP_ALT_SMAC |
- IB_QP_VID |
- IB_QP_ALT_VID),
- [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC |
- IB_QP_VID |
- IB_QP_ALT_VID),
- [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC |
- IB_QP_VID |
- IB_QP_ALT_VID)
- }
- }
+ },
},
[IB_QPS_RTR] = {
[IB_QPS_RESET] = { .valid = 1 },
@@ -847,13 +984,6 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
req_param = qp_state_table[cur_state][next_state].req_param[type];
opt_param = qp_state_table[cur_state][next_state].opt_param[type];
- if (ll == IB_LINK_LAYER_ETHERNET) {
- req_param |= qp_state_table[cur_state][next_state].
- req_param_add_eth[type];
- opt_param |= qp_state_table[cur_state][next_state].
- opt_param_add_eth[type];
- }
-
if ((mask & req_param) != req_param)
return 0;
@@ -864,40 +994,52 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
}
EXPORT_SYMBOL(ib_modify_qp_is_ok);
-int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
- struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+int ib_resolve_eth_dmac(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask)
{
int ret = 0;
- union ib_gid sgid;
- if ((*qp_attr_mask & IB_QP_AV) &&
- (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) {
- ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num,
- qp_attr->ah_attr.grh.sgid_index, &sgid);
- if (ret)
- goto out;
+ if (*qp_attr_mask & IB_QP_AV) {
+ if (qp_attr->ah_attr.port_num < rdma_start_port(qp->device) ||
+ qp_attr->ah_attr.port_num > rdma_end_port(qp->device))
+ return -EINVAL;
+
+ if (!rdma_cap_eth_ah(qp->device, qp_attr->ah_attr.port_num))
+ return 0;
+
if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) {
- rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac);
- rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac);
- if (!(*qp_attr_mask & IB_QP_VID))
- qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
+ rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw,
+ qp_attr->ah_attr.dmac);
} else {
- ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid,
- qp_attr->ah_attr.dmac, &qp_attr->vlan_id);
- if (ret)
- goto out;
- ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL);
- if (ret)
+ union ib_gid sgid;
+ struct ib_gid_attr sgid_attr;
+ int ifindex;
+
+ ret = ib_query_gid(qp->device,
+ qp_attr->ah_attr.port_num,
+ qp_attr->ah_attr.grh.sgid_index,
+ &sgid, &sgid_attr);
+
+ if (ret || !sgid_attr.ndev) {
+ if (!ret)
+ ret = -ENXIO;
goto out;
+ }
+
+ ifindex = sgid_attr.ndev->ifindex;
+
+ ret = rdma_addr_find_dmac_by_grh(&sgid,
+ &qp_attr->ah_attr.grh.dgid,
+ qp_attr->ah_attr.dmac,
+ NULL, ifindex);
+
+ dev_put(sgid_attr.ndev);
}
- *qp_attr_mask |= IB_QP_SMAC;
- if (qp_attr->vlan_id < 0xFFFF)
- *qp_attr_mask |= IB_QP_VID;
}
out:
return ret;
}
-EXPORT_SYMBOL(ib_resolve_eth_l2_attrs);
+EXPORT_SYMBOL(ib_resolve_eth_dmac);
int ib_modify_qp(struct ib_qp *qp,
@@ -906,7 +1048,7 @@ int ib_modify_qp(struct ib_qp *qp,
{
int ret;
- ret = ib_resolve_eth_l2_attrs(qp, qp_attr, &qp_attr_mask);
+ ret = ib_resolve_eth_dmac(qp, qp_attr, &qp_attr_mask);
if (ret)
return ret;
@@ -1012,11 +1154,12 @@ EXPORT_SYMBOL(ib_destroy_qp);
struct ib_cq *ib_create_cq(struct ib_device *device,
ib_comp_handler comp_handler,
void (*event_handler)(struct ib_event *, void *),
- void *cq_context, int cqe, int comp_vector)
+ void *cq_context,
+ const struct ib_cq_init_attr *cq_attr)
{
struct ib_cq *cq;
- cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+ cq = device->create_cq(device, cq_attr, NULL, NULL);
if (!IS_ERR(cq)) {
cq->device = device;
@@ -1079,73 +1222,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
}
EXPORT_SYMBOL(ib_get_dma_mr);
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start)
-{
- struct ib_mr *mr;
- int err;
-
- err = ib_check_mr_access(mr_access_flags);
- if (err)
- return ERR_PTR(err);
-
- if (!pd->device->reg_phys_mr)
- return ERR_PTR(-ENOSYS);
-
- mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
- mr_access_flags, iova_start);
-
- if (!IS_ERR(mr)) {
- mr->device = pd->device;
- mr->pd = pd;
- mr->uobject = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
- }
-
- return mr;
-}
-EXPORT_SYMBOL(ib_reg_phys_mr);
-
-int ib_rereg_phys_mr(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start)
-{
- struct ib_pd *old_pd;
- int ret;
-
- ret = ib_check_mr_access(mr_access_flags);
- if (ret)
- return ret;
-
- if (!mr->device->rereg_phys_mr)
- return -ENOSYS;
-
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- old_pd = mr->pd;
-
- ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
- phys_buf_array, num_phys_buf,
- mr_access_flags, iova_start);
-
- if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
- atomic_dec(&old_pd->usecnt);
- atomic_inc(&pd->usecnt);
- }
-
- return ret;
-}
-EXPORT_SYMBOL(ib_rereg_phys_mr);
-
int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
{
return mr->device->query_mr ?
@@ -1170,54 +1246,28 @@ int ib_dereg_mr(struct ib_mr *mr)
}
EXPORT_SYMBOL(ib_dereg_mr);
-struct ib_mr *ib_create_mr(struct ib_pd *pd,
- struct ib_mr_init_attr *mr_init_attr)
-{
- struct ib_mr *mr;
-
- if (!pd->device->create_mr)
- return ERR_PTR(-ENOSYS);
-
- mr = pd->device->create_mr(pd, mr_init_attr);
-
- if (!IS_ERR(mr)) {
- mr->device = pd->device;
- mr->pd = pd;
- mr->uobject = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
- }
-
- return mr;
-}
-EXPORT_SYMBOL(ib_create_mr);
-
-int ib_destroy_mr(struct ib_mr *mr)
-{
- struct ib_pd *pd;
- int ret;
-
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- pd = mr->pd;
- ret = mr->device->destroy_mr(mr);
- if (!ret)
- atomic_dec(&pd->usecnt);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_destroy_mr);
-
-struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+/**
+ * ib_alloc_mr() - Allocates a memory region
+ * @pd: protection domain associated with the region
+ * @mr_type: memory region type
+ * @max_num_sg: maximum sg entries available for registration.
+ *
+ * Notes:
+ * Memory registeration page/sg lists must not exceed max_num_sg.
+ * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
+ * max_num_sg * used_page_size.
+ *
+ */
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg)
{
struct ib_mr *mr;
- if (!pd->device->alloc_fast_reg_mr)
+ if (!pd->device->alloc_mr)
return ERR_PTR(-ENOSYS);
- mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
-
+ mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
if (!IS_ERR(mr)) {
mr->device = pd->device;
mr->pd = pd;
@@ -1228,32 +1278,7 @@ struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
return mr;
}
-EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
-
-struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
- int max_page_list_len)
-{
- struct ib_fast_reg_page_list *page_list;
-
- if (!device->alloc_fast_reg_page_list)
- return ERR_PTR(-ENOSYS);
-
- page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
-
- if (!IS_ERR(page_list)) {
- page_list->device = device;
- page_list->max_page_list_len = max_page_list_len;
- }
-
- return page_list;
-}
-EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
-
-void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
-{
- page_list->device->free_fast_reg_page_list(page_list);
-}
-EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+EXPORT_SYMBOL(ib_alloc_mr);
/* Memory windows */
@@ -1446,3 +1471,111 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS;
}
EXPORT_SYMBOL(ib_check_mr_status);
+
+/**
+ * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list
+ * and set it the memory region.
+ * @mr: memory region
+ * @sg: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @page_size: page vector desired page size
+ *
+ * Constraints:
+ * - The first sg element is allowed to have an offset.
+ * - Each sg element must be aligned to page_size (or physically
+ * contiguous to the previous element). In case an sg element has a
+ * non contiguous offset, the mapping prefix will not include it.
+ * - The last sg element is allowed to have length less than page_size.
+ * - If sg_nents total byte length exceeds the mr max_num_sge * page_size
+ * then only max_num_sg entries will be mapped.
+ *
+ * Returns the number of sg elements that were mapped to the memory region.
+ *
+ * After this completes successfully, the memory region
+ * is ready for registration.
+ */
+int ib_map_mr_sg(struct ib_mr *mr,
+ struct scatterlist *sg,
+ int sg_nents,
+ unsigned int page_size)
+{
+ if (unlikely(!mr->device->map_mr_sg))
+ return -ENOSYS;
+
+ mr->page_size = page_size;
+
+ return mr->device->map_mr_sg(mr, sg, sg_nents);
+}
+EXPORT_SYMBOL(ib_map_mr_sg);
+
+/**
+ * ib_sg_to_pages() - Convert the largest prefix of a sg list
+ * to a page vector
+ * @mr: memory region
+ * @sgl: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @set_page: driver page assignment function pointer
+ *
+ * Core service helper for drivers to convert the largest
+ * prefix of given sg list to a page vector. The sg list
+ * prefix converted is the prefix that meet the requirements
+ * of ib_map_mr_sg.
+ *
+ * Returns the number of sg elements that were assigned to
+ * a page vector.
+ */
+int ib_sg_to_pages(struct ib_mr *mr,
+ struct scatterlist *sgl,
+ int sg_nents,
+ int (*set_page)(struct ib_mr *, u64))
+{
+ struct scatterlist *sg;
+ u64 last_end_dma_addr = 0, last_page_addr = 0;
+ unsigned int last_page_off = 0;
+ u64 page_mask = ~((u64)mr->page_size - 1);
+ int i, ret;
+
+ mr->iova = sg_dma_address(&sgl[0]);
+ mr->length = 0;
+
+ for_each_sg(sgl, sg, sg_nents, i) {
+ u64 dma_addr = sg_dma_address(sg);
+ unsigned int dma_len = sg_dma_len(sg);
+ u64 end_dma_addr = dma_addr + dma_len;
+ u64 page_addr = dma_addr & page_mask;
+
+ /*
+ * For the second and later elements, check whether either the
+ * end of element i-1 or the start of element i is not aligned
+ * on a page boundary.
+ */
+ if (i && (last_page_off != 0 || page_addr != dma_addr)) {
+ /* Stop mapping if there is a gap. */
+ if (last_end_dma_addr != dma_addr)
+ break;
+
+ /*
+ * Coalesce this element with the last. If it is small
+ * enough just update mr->length. Otherwise start
+ * mapping from the next page.
+ */
+ goto next_page;
+ }
+
+ do {
+ ret = set_page(mr, page_addr);
+ if (unlikely(ret < 0))
+ return i ? : ret;
+next_page:
+ page_addr += mr->page_size;
+ } while (page_addr < end_dma_addr);
+
+ mr->length += dma_len;
+ last_end_dma_addr = end_dma_addr;
+ last_page_addr = end_dma_addr & page_mask;
+ last_page_off = end_dma_addr & ~page_mask;
+ }
+
+ return i;
+}
+EXPORT_SYMBOL(ib_sg_to_pages);