diff options
Diffstat (limited to 'kernel/drivers/infiniband/core')
31 files changed, 5709 insertions, 1811 deletions
diff --git a/kernel/drivers/infiniband/core/Makefile b/kernel/drivers/infiniband/core/Makefile index acf736764..d43a8994a 100644 --- a/kernel/drivers/infiniband/core/Makefile +++ b/kernel/drivers/infiniband/core/Makefile @@ -9,7 +9,8 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ $(user_access-y) ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ - device.o fmr_pool.o cache.o netlink.o + device.o fmr_pool.o cache.o netlink.o \ + roce_gid_mgmt.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o diff --git a/kernel/drivers/infiniband/core/addr.c b/kernel/drivers/infiniband/core/addr.c index 38339d220..34b1adad0 100644 --- a/kernel/drivers/infiniband/core/addr.c +++ b/kernel/drivers/infiniband/core/addr.c @@ -128,7 +128,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, int ret = -EADDRNOTAVAIL; if (dev_addr->bound_dev_if) { - dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!dev) return -ENODEV; ret = rdma_copy_addr(dev_addr, dev, NULL); @@ -138,7 +138,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, switch (addr->sa_family) { case AF_INET: - dev = ip_dev_find(&init_net, + dev = ip_dev_find(dev_addr->net, ((struct sockaddr_in *) addr)->sin_addr.s_addr); if (!dev) @@ -149,12 +149,11 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); break; - #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { - if (ipv6_chk_addr(&init_net, + for_each_netdev_rcu(dev_addr->net, dev) { + if (ipv6_chk_addr(dev_addr->net, &((struct sockaddr_in6 *) addr)->sin6_addr, dev, 1)) { ret = rdma_copy_addr(dev_addr, dev, NULL); @@ -236,7 +235,7 @@ static int addr4_resolve(struct sockaddr_in *src_in, fl4.daddr = dst_ip; fl4.saddr = src_ip; fl4.flowi4_oif = addr->bound_dev_if; - rt = ip_route_output_key(&init_net, &fl4); + rt = ip_route_output_key(addr->net, &fl4); if (IS_ERR(rt)) { ret = PTR_ERR(rt); goto out; @@ -278,12 +277,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, fl6.saddr = src_in->sin6_addr; fl6.flowi6_oif = addr->bound_dev_if; - dst = ip6_route_output(&init_net, NULL, &fl6); + dst = ip6_route_output(addr->net, NULL, &fl6); if ((ret = dst->error)) goto put; if (ipv6_addr_any(&fl6.saddr)) { - ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev, + ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev, &fl6.daddr, 0, &fl6.saddr); if (ret) goto put; @@ -457,8 +456,8 @@ static void resolve_cb(int status, struct sockaddr *src_addr, complete(&((struct resolve_cb_context *)context)->comp); } -int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac, - u16 *vlan_id) +int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, + u8 *dmac, u16 *vlan_id, int if_index) { int ret = 0; struct rdma_dev_addr dev_addr; @@ -476,6 +475,8 @@ int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac, rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); + dev_addr.bound_dev_if = if_index; + dev_addr.net = &init_net; ctx.addr = &dev_addr; init_completion(&ctx.comp); @@ -510,6 +511,7 @@ int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) rdma_gid2ip(&gid_addr._sockaddr, sgid); memset(&dev_addr, 0, sizeof(dev_addr)); + dev_addr.net = &init_net; ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); if (ret) return ret; diff --git a/kernel/drivers/infiniband/core/agent.c b/kernel/drivers/infiniband/core/agent.c index f6d29614c..4fa524dfb 100644 --- a/kernel/drivers/infiniband/core/agent.c +++ b/kernel/drivers/infiniband/core/agent.c @@ -54,7 +54,7 @@ static DEFINE_SPINLOCK(ib_agent_port_list_lock); static LIST_HEAD(ib_agent_port_list); static struct ib_agent_port_private * -__ib_get_agent_port(struct ib_device *device, int port_num) +__ib_get_agent_port(const struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; @@ -67,7 +67,7 @@ __ib_get_agent_port(struct ib_device *device, int port_num) } static struct ib_agent_port_private * -ib_get_agent_port(struct ib_device *device, int port_num) +ib_get_agent_port(const struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; unsigned long flags; @@ -78,9 +78,9 @@ ib_get_agent_port(struct ib_device *device, int port_num) return entry; } -void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, - struct ib_wc *wc, struct ib_device *device, - int port_num, int qpn) +void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, + const struct ib_wc *wc, const struct ib_device *device, + int port_num, int qpn, size_t resp_mad_len, bool opa) { struct ib_agent_port_private *port_priv; struct ib_mad_agent *agent; @@ -88,7 +88,7 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, struct ib_ah *ah; struct ib_mad_send_wr_private *mad_send_wr; - if (device->node_type == RDMA_NODE_IB_SWITCH) + if (rdma_cap_ib_switch(device)) port_priv = ib_get_agent_port(device, 0); else port_priv = ib_get_agent_port(device, port_num); @@ -106,22 +106,27 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, return; } + if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION) + resp_mad_len = IB_MGMT_MAD_SIZE; + send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0, - IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_KERNEL); + IB_MGMT_MAD_HDR, + resp_mad_len - IB_MGMT_MAD_HDR, + GFP_KERNEL, + mad_hdr->base_version); if (IS_ERR(send_buf)) { dev_err(&device->dev, "ib_create_send_mad error\n"); goto err1; } - memcpy(send_buf->mad, mad, sizeof *mad); + memcpy(send_buf->mad, mad_hdr, resp_mad_len); send_buf->ah = ah; - if (device->node_type == RDMA_NODE_IB_SWITCH) { + if (rdma_cap_ib_switch(device)) { mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); - mad_send_wr->send_wr.wr.ud.port_num = port_num; + mad_send_wr->send_wr.port_num = port_num; } if (ib_post_send_mad(send_buf, NULL)) { @@ -156,7 +161,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error1; } - if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) { + if (rdma_cap_ib_smi(device, port_num)) { /* Obtain send only MAD agent for SMI QP */ port_priv->agent[0] = ib_register_mad_agent(device, port_num, IB_QPT_SMI, NULL, 0, diff --git a/kernel/drivers/infiniband/core/agent.h b/kernel/drivers/infiniband/core/agent.h index 666928700..65f92beda 100644 --- a/kernel/drivers/infiniband/core/agent.h +++ b/kernel/drivers/infiniband/core/agent.h @@ -44,8 +44,8 @@ extern int ib_agent_port_open(struct ib_device *device, int port_num); extern int ib_agent_port_close(struct ib_device *device, int port_num); -extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, - struct ib_wc *wc, struct ib_device *device, - int port_num, int qpn); +extern void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, + const struct ib_wc *wc, const struct ib_device *device, + int port_num, int qpn, size_t resp_mad_len, bool opa); #endif /* __AGENT_H_ */ diff --git a/kernel/drivers/infiniband/core/cache.c b/kernel/drivers/infiniband/core/cache.c index 80f6cf244..89bebeada 100644 --- a/kernel/drivers/infiniband/core/cache.c +++ b/kernel/drivers/infiniband/core/cache.c @@ -37,6 +37,8 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/workqueue.h> +#include <linux/netdevice.h> +#include <net/addrconf.h> #include <rdma/ib_cache.h> @@ -47,90 +49,720 @@ struct ib_pkey_cache { u16 table[0]; }; -struct ib_gid_cache { - int table_len; - union ib_gid table[0]; -}; - struct ib_update_work { struct work_struct work; struct ib_device *device; u8 port_num; }; -static inline int start_port(struct ib_device *device) +union ib_gid zgid; +EXPORT_SYMBOL(zgid); + +static const struct ib_gid_attr zattr; + +enum gid_attr_find_mask { + GID_ATTR_FIND_MASK_GID = 1UL << 0, + GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, + GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, +}; + +enum gid_table_entry_props { + GID_TABLE_ENTRY_INVALID = 1UL << 0, + GID_TABLE_ENTRY_DEFAULT = 1UL << 1, +}; + +enum gid_table_write_action { + GID_TABLE_WRITE_ACTION_ADD, + GID_TABLE_WRITE_ACTION_DEL, + /* MODIFY only updates the GID table. Currently only used by + * ib_cache_update. + */ + GID_TABLE_WRITE_ACTION_MODIFY +}; + +struct ib_gid_table_entry { + /* This lock protects an entry from being + * read and written simultaneously. + */ + rwlock_t lock; + unsigned long props; + union ib_gid gid; + struct ib_gid_attr attr; + void *context; +}; + +struct ib_gid_table { + int sz; + /* In RoCE, adding a GID to the table requires: + * (a) Find if this GID is already exists. + * (b) Find a free space. + * (c) Write the new GID + * + * Delete requires different set of operations: + * (a) Find the GID + * (b) Delete it. + * + * Add/delete should be carried out atomically. + * This is done by locking this mutex from multiple + * writers. We don't need this lock for IB, as the MAD + * layer replaces all entries. All data_vec entries + * are locked by this lock. + **/ + struct mutex lock; + struct ib_gid_table_entry *data_vec; +}; + +static int write_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + enum gid_table_write_action action, + bool default_gid) +{ + int ret = 0; + struct net_device *old_net_dev; + unsigned long flags; + + /* in rdma_cap_roce_gid_table, this funciton should be protected by a + * sleep-able lock. + */ + write_lock_irqsave(&table->data_vec[ix].lock, flags); + + if (rdma_cap_roce_gid_table(ib_dev, port)) { + table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; + write_unlock_irqrestore(&table->data_vec[ix].lock, flags); + /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by + * RoCE providers and thus only updates the cache. + */ + if (action == GID_TABLE_WRITE_ACTION_ADD) + ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr, + &table->data_vec[ix].context); + else if (action == GID_TABLE_WRITE_ACTION_DEL) + ret = ib_dev->del_gid(ib_dev, port, ix, + &table->data_vec[ix].context); + write_lock_irqsave(&table->data_vec[ix].lock, flags); + } + + old_net_dev = table->data_vec[ix].attr.ndev; + if (old_net_dev && old_net_dev != attr->ndev) + dev_put(old_net_dev); + /* if modify_gid failed, just delete the old gid */ + if (ret || action == GID_TABLE_WRITE_ACTION_DEL) { + gid = &zgid; + attr = &zattr; + table->data_vec[ix].context = NULL; + } + if (default_gid) + table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT; + memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid)); + memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr)); + if (table->data_vec[ix].attr.ndev && + table->data_vec[ix].attr.ndev != old_net_dev) + dev_hold(table->data_vec[ix].attr.ndev); + + table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID; + + write_unlock_irqrestore(&table->data_vec[ix].lock, flags); + + if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) { + struct ib_event event; + + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; + + ib_dispatch_event(&event); + } + return ret; +} + +static int add_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + bool default_gid) { + return write_gid(ib_dev, port, table, ix, gid, attr, + GID_TABLE_WRITE_ACTION_ADD, default_gid); +} + +static int modify_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + bool default_gid) { + return write_gid(ib_dev, port, table, ix, gid, attr, + GID_TABLE_WRITE_ACTION_MODIFY, default_gid); +} + +static int del_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + bool default_gid) { + return write_gid(ib_dev, port, table, ix, &zgid, &zattr, + GID_TABLE_WRITE_ACTION_DEL, default_gid); +} + +static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, + const struct ib_gid_attr *val, bool default_gid, + unsigned long mask) { - return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; + int i; + + for (i = 0; i < table->sz; i++) { + unsigned long flags; + struct ib_gid_attr *attr = &table->data_vec[i].attr; + + read_lock_irqsave(&table->data_vec[i].lock, flags); + + if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) + goto next; + + if (mask & GID_ATTR_FIND_MASK_GID && + memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) + goto next; + + if (mask & GID_ATTR_FIND_MASK_NETDEV && + attr->ndev != val->ndev) + goto next; + + if (mask & GID_ATTR_FIND_MASK_DEFAULT && + !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) != + default_gid) + goto next; + + read_unlock_irqrestore(&table->data_vec[i].lock, flags); + return i; +next: + read_unlock_irqrestore(&table->data_vec[i].lock, flags); + } + + return -1; } -static inline int end_port(struct ib_device *device) +static void make_default_gid(struct net_device *dev, union ib_gid *gid) { - return (device->node_type == RDMA_NODE_IB_SWITCH) ? - 0 : device->phys_port_cnt; + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + addrconf_ifid_eui48(&gid->raw[8], dev); } -int ib_get_cached_gid(struct ib_device *device, - u8 port_num, - int index, - union ib_gid *gid) +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) { - struct ib_gid_cache *cache; - unsigned long flags; + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + int ix; int ret = 0; + struct net_device *idev; - if (port_num < start_port(device) || port_num > end_port(device)) + table = ports_table[port - rdma_start_port(ib_dev)]; + + if (!memcmp(gid, &zgid, sizeof(*gid))) return -EINVAL; - read_lock_irqsave(&device->cache.lock, flags); + if (ib_dev->get_netdev) { + idev = ib_dev->get_netdev(ib_dev, port); + if (idev && attr->ndev != idev) { + union ib_gid default_gid; - cache = device->cache.gid_cache[port_num - start_port(device)]; + /* Adding default GIDs in not permitted */ + make_default_gid(idev, &default_gid); + if (!memcmp(gid, &default_gid, sizeof(*gid))) { + dev_put(idev); + return -EPERM; + } + } + if (idev) + dev_put(idev); + } - if (index < 0 || index >= cache->table_len) - ret = -EINVAL; - else - *gid = cache->table[index]; + mutex_lock(&table->lock); - read_unlock_irqrestore(&device->cache.lock, flags); + ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_NETDEV); + if (ix >= 0) + goto out_unlock; + + ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_DEFAULT); + if (ix < 0) { + ret = -ENOSPC; + goto out_unlock; + } + add_gid(ib_dev, port, table, ix, gid, attr, false); + +out_unlock: + mutex_unlock(&table->lock); return ret; } -EXPORT_SYMBOL(ib_get_cached_gid); -int ib_find_cached_gid(struct ib_device *device, - union ib_gid *gid, - u8 *port_num, - u16 *index) +int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + int ix; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + mutex_lock(&table->lock); + + ix = find_gid(table, gid, attr, false, + GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_NETDEV | + GID_ATTR_FIND_MASK_DEFAULT); + if (ix < 0) + goto out_unlock; + + del_gid(ib_dev, port, table, ix, false); + +out_unlock: + mutex_unlock(&table->lock); + return 0; +} + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *ndev) { - struct ib_gid_cache *cache; + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + int ix; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + mutex_lock(&table->lock); + + for (ix = 0; ix < table->sz; ix++) + if (table->data_vec[ix].attr.ndev == ndev) + del_gid(ib_dev, port, table, ix, false); + + mutex_unlock(&table->lock); + return 0; +} + +static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; unsigned long flags; - int p, i; - int ret = -ENOENT; - *port_num = -1; + table = ports_table[port - rdma_start_port(ib_dev)]; + + if (index < 0 || index >= table->sz) + return -EINVAL; + + read_lock_irqsave(&table->data_vec[index].lock, flags); + if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) { + read_unlock_irqrestore(&table->data_vec[index].lock, flags); + return -EAGAIN; + } + + memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); + if (attr) { + memcpy(attr, &table->data_vec[index].attr, sizeof(*attr)); + if (attr->ndev) + dev_hold(attr->ndev); + } + + read_unlock_irqrestore(&table->data_vec[index].lock, flags); + return 0; +} + +static int _ib_cache_gid_table_find(struct ib_device *ib_dev, + const union ib_gid *gid, + const struct ib_gid_attr *val, + unsigned long mask, + u8 *port, u16 *index) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + u8 p; + int local_index; + + for (p = 0; p < ib_dev->phys_port_cnt; p++) { + table = ports_table[p]; + local_index = find_gid(table, gid, val, false, mask); + if (local_index >= 0) { + if (index) + *index = local_index; + if (port) + *port = p + rdma_start_port(ib_dev); + return 0; + } + } + + return -ENOENT; +} + +static int ib_cache_gid_find(struct ib_device *ib_dev, + const union ib_gid *gid, + struct net_device *ndev, u8 *port, + u16 *index) +{ + unsigned long mask = GID_ATTR_FIND_MASK_GID; + struct ib_gid_attr gid_attr_val = {.ndev = ndev}; + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, + mask, port, index); +} + +int ib_find_cached_gid_by_port(struct ib_device *ib_dev, + const union ib_gid *gid, + u8 port, struct net_device *ndev, + u16 *index) +{ + int local_index; + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + unsigned long mask = GID_ATTR_FIND_MASK_GID; + struct ib_gid_attr val = {.ndev = ndev}; + + if (port < rdma_start_port(ib_dev) || + port > rdma_end_port(ib_dev)) + return -ENOENT; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + local_index = find_gid(table, gid, &val, false, mask); + if (local_index >= 0) { + if (index) + *index = local_index; + return 0; + } + + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_cached_gid_by_port); + +/** + * ib_find_gid_by_filter - Returns the GID table index where a specified + * GID value occurs + * @device: The device to query. + * @gid: The GID value to search for. + * @port_num: The port number of the device where the GID value could be + * searched. + * @filter: The filter function is executed on any matching GID in the table. + * If the filter function returns true, the corresponding index is returned, + * otherwise, we continue searching the GID table. It's guaranteed that + * while filter is executed, ndev field is valid and the structure won't + * change. filter is executed in an atomic context. filter must not be NULL. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + * + * ib_cache_gid_find_by_filter() searches for the specified GID value + * of which the filter function returns true in the port's GID table. + * This function is only supported on RoCE ports. + * + */ +static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, + const union ib_gid *gid, + u8 port, + bool (*filter)(const union ib_gid *, + const struct ib_gid_attr *, + void *), + void *context, + u16 *index) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + unsigned int i; + bool found = false; + + if (!ports_table) + return -EOPNOTSUPP; + + if (port < rdma_start_port(ib_dev) || + port > rdma_end_port(ib_dev) || + !rdma_protocol_roce(ib_dev, port)) + return -EPROTONOSUPPORT; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + for (i = 0; i < table->sz; i++) { + struct ib_gid_attr attr; + unsigned long flags; + + read_lock_irqsave(&table->data_vec[i].lock, flags); + if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) + goto next; + + if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) + goto next; + + memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); + + if (filter(gid, &attr, context)) + found = true; + +next: + read_unlock_irqrestore(&table->data_vec[i].lock, flags); + + if (found) + break; + } + + if (!found) + return -ENOENT; + if (index) - *index = -1; + *index = i; + return 0; +} - read_lock_irqsave(&device->cache.lock, flags); +static struct ib_gid_table *alloc_gid_table(int sz) +{ + unsigned int i; + struct ib_gid_table *table = + kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); + if (!table) + return NULL; - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - cache = device->cache.gid_cache[p]; - for (i = 0; i < cache->table_len; ++i) { - if (!memcmp(gid, &cache->table[i], sizeof *gid)) { - *port_num = p + start_port(device); - if (index) - *index = i; - ret = 0; - goto found; - } + table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); + if (!table->data_vec) + goto err_free_table; + + mutex_init(&table->lock); + + table->sz = sz; + + for (i = 0; i < sz; i++) + rwlock_init(&table->data_vec[i].lock); + + return table; + +err_free_table: + kfree(table); + return NULL; +} + +static void release_gid_table(struct ib_gid_table *table) +{ + if (table) { + kfree(table->data_vec); + kfree(table); + } +} + +static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table) +{ + int i; + + if (!table) + return; + + for (i = 0; i < table->sz; ++i) { + if (memcmp(&table->data_vec[i].gid, &zgid, + sizeof(table->data_vec[i].gid))) + del_gid(ib_dev, port, table, i, + table->data_vec[i].props & + GID_ATTR_FIND_MASK_DEFAULT); + } +} + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, + struct net_device *ndev, + enum ib_cache_gid_default_mode mode) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + union ib_gid gid; + struct ib_gid_attr gid_attr; + struct ib_gid_table *table; + int ix; + union ib_gid current_gid; + struct ib_gid_attr current_gid_attr = {}; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + make_default_gid(ndev, &gid); + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + + mutex_lock(&table->lock); + ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT); + + /* Coudn't find default GID location */ + WARN_ON(ix < 0); + + if (!__ib_cache_gid_get(ib_dev, port, ix, + ¤t_gid, ¤t_gid_attr) && + mode == IB_CACHE_GID_DEFAULT_MODE_SET && + !memcmp(&gid, ¤t_gid, sizeof(gid)) && + !memcmp(&gid_attr, ¤t_gid_attr, sizeof(gid_attr))) + goto unlock; + + if ((memcmp(¤t_gid, &zgid, sizeof(current_gid)) || + memcmp(¤t_gid_attr, &zattr, + sizeof(current_gid_attr))) && + del_gid(ib_dev, port, table, ix, true)) { + pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", + ix, gid.raw); + goto unlock; + } + + if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) + if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) + pr_warn("ib_cache_gid: unable to add default gid %pI6\n", + gid.raw); + +unlock: + if (current_gid_attr.ndev) + dev_put(current_gid_attr.ndev); + mutex_unlock(&table->lock); +} + +static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table) +{ + if (rdma_protocol_roce(ib_dev, port)) { + struct ib_gid_table_entry *entry = &table->data_vec[0]; + + entry->props |= GID_TABLE_ENTRY_DEFAULT; + } + + return 0; +} + +static int _gid_table_setup_one(struct ib_device *ib_dev) +{ + u8 port; + struct ib_gid_table **table; + int err = 0; + + table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL); + + if (!table) { + pr_warn("failed to allocate ib gid cache for %s\n", + ib_dev->name); + return -ENOMEM; + } + + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + u8 rdma_port = port + rdma_start_port(ib_dev); + + table[port] = + alloc_gid_table( + ib_dev->port_immutable[rdma_port].gid_tbl_len); + if (!table[port]) { + err = -ENOMEM; + goto rollback_table_setup; } + + err = gid_table_reserve_default(ib_dev, + port + rdma_start_port(ib_dev), + table[port]); + if (err) + goto rollback_table_setup; } -found: - read_unlock_irqrestore(&device->cache.lock, flags); - return ret; + ib_dev->cache.gid_cache = table; + return 0; + +rollback_table_setup: + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), + table[port]); + release_gid_table(table[port]); + } + + kfree(table); + return err; +} + +static void gid_table_release_one(struct ib_device *ib_dev) +{ + struct ib_gid_table **table = ib_dev->cache.gid_cache; + u8 port; + + if (!table) + return; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) + release_gid_table(table[port]); + + kfree(table); + ib_dev->cache.gid_cache = NULL; +} + +static void gid_table_cleanup_one(struct ib_device *ib_dev) +{ + struct ib_gid_table **table = ib_dev->cache.gid_cache; + u8 port; + + if (!table) + return; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) + cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), + table[port]); +} + +static int gid_table_setup_one(struct ib_device *ib_dev) +{ + int err; + + err = _gid_table_setup_one(ib_dev); + + if (err) + return err; + + err = roce_rescan_device(ib_dev); + + if (err) { + gid_table_cleanup_one(ib_dev); + gid_table_release_one(ib_dev); + } + + return err; +} + +int ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid, + struct ib_gid_attr *gid_attr) +{ + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + return __ib_cache_gid_get(device, port_num, index, gid, gid_attr); +} +EXPORT_SYMBOL(ib_get_cached_gid); + +int ib_find_cached_gid(struct ib_device *device, + const union ib_gid *gid, + struct net_device *ndev, + u8 *port_num, + u16 *index) +{ + return ib_cache_gid_find(device, gid, ndev, port_num, index); } EXPORT_SYMBOL(ib_find_cached_gid); +int ib_find_gid_by_filter(struct ib_device *device, + const union ib_gid *gid, + u8 port_num, + bool (*filter)(const union ib_gid *gid, + const struct ib_gid_attr *, + void *), + void *context, u16 *index) +{ + /* Only RoCE GID table supports filter function */ + if (!rdma_cap_roce_gid_table(device, port_num) && filter) + return -EPROTONOSUPPORT; + + return ib_cache_gid_find_by_filter(device, gid, + port_num, filter, + context, index); +} +EXPORT_SYMBOL(ib_find_gid_by_filter); + int ib_get_cached_pkey(struct ib_device *device, u8 port_num, int index, @@ -140,12 +772,12 @@ int ib_get_cached_pkey(struct ib_device *device, unsigned long flags; int ret = 0; - if (port_num < start_port(device) || port_num > end_port(device)) + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.pkey_cache[port_num - start_port(device)]; + cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; if (index < 0 || index >= cache->table_len) ret = -EINVAL; @@ -169,12 +801,12 @@ int ib_find_cached_pkey(struct ib_device *device, int ret = -ENOENT; int partial_ix = -1; - if (port_num < start_port(device) || port_num > end_port(device)) + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.pkey_cache[port_num - start_port(device)]; + cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; *index = -1; @@ -209,12 +841,12 @@ int ib_find_exact_cached_pkey(struct ib_device *device, int i; int ret = -ENOENT; - if (port_num < start_port(device) || port_num > end_port(device)) + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.pkey_cache[port_num - start_port(device)]; + cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; *index = -1; @@ -238,11 +870,11 @@ int ib_get_cached_lmc(struct ib_device *device, unsigned long flags; int ret = 0; - if (port_num < start_port(device) || port_num > end_port(device)) + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - *lmc = device->cache.lmc_cache[port_num - start_port(device)]; + *lmc = device->cache.lmc_cache[port_num - rdma_start_port(device)]; read_unlock_irqrestore(&device->cache.lock, flags); return ret; @@ -254,9 +886,21 @@ static void ib_cache_update(struct ib_device *device, { struct ib_port_attr *tprops = NULL; struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; - struct ib_gid_cache *gid_cache = NULL, *old_gid_cache; + struct ib_gid_cache { + int table_len; + union ib_gid table[0]; + } *gid_cache = NULL; int i; int ret; + struct ib_gid_table *table; + struct ib_gid_table **ports_table = device->cache.gid_cache; + bool use_roce_gid_table = + rdma_cap_roce_gid_table(device, port); + + if (port < rdma_start_port(device) || port > rdma_end_port(device)) + return; + + table = ports_table[port - rdma_start_port(device)]; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) @@ -276,12 +920,14 @@ static void ib_cache_update(struct ib_device *device, pkey_cache->table_len = tprops->pkey_tbl_len; - gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len * - sizeof *gid_cache->table, GFP_KERNEL); - if (!gid_cache) - goto err; + if (!use_roce_gid_table) { + gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len * + sizeof(*gid_cache->table), GFP_KERNEL); + if (!gid_cache) + goto err; - gid_cache->table_len = tprops->gid_tbl_len; + gid_cache->table_len = tprops->gid_tbl_len; + } for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); @@ -292,29 +938,36 @@ static void ib_cache_update(struct ib_device *device, } } - for (i = 0; i < gid_cache->table_len; ++i) { - ret = ib_query_gid(device, port, i, gid_cache->table + i); - if (ret) { - printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", - ret, device->name, i); - goto err; + if (!use_roce_gid_table) { + for (i = 0; i < gid_cache->table_len; ++i) { + ret = ib_query_gid(device, port, i, + gid_cache->table + i, NULL); + if (ret) { + printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } } } write_lock_irq(&device->cache.lock); - old_pkey_cache = device->cache.pkey_cache[port - start_port(device)]; - old_gid_cache = device->cache.gid_cache [port - start_port(device)]; + old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)]; - device->cache.pkey_cache[port - start_port(device)] = pkey_cache; - device->cache.gid_cache [port - start_port(device)] = gid_cache; + device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; + if (!use_roce_gid_table) { + for (i = 0; i < gid_cache->table_len; i++) { + modify_gid(device, port, table, i, gid_cache->table + i, + &zattr, false); + } + } - device->cache.lmc_cache[port - start_port(device)] = tprops->lmc; + device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc; write_unlock_irq(&device->cache.lock); + kfree(gid_cache); kfree(old_pkey_cache); - kfree(old_gid_cache); kfree(tprops); return; @@ -355,85 +1008,88 @@ static void ib_cache_event(struct ib_event_handler *handler, } } -static void ib_cache_setup_one(struct ib_device *device) +int ib_cache_setup_one(struct ib_device *device) { int p; + int err; rwlock_init(&device->cache.lock); device->cache.pkey_cache = - kmalloc(sizeof *device->cache.pkey_cache * - (end_port(device) - start_port(device) + 1), GFP_KERNEL); - device->cache.gid_cache = - kmalloc(sizeof *device->cache.gid_cache * - (end_port(device) - start_port(device) + 1), GFP_KERNEL); - + kzalloc(sizeof *device->cache.pkey_cache * + (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache * - (end_port(device) - - start_port(device) + 1), + (rdma_end_port(device) - + rdma_start_port(device) + 1), GFP_KERNEL); - - if (!device->cache.pkey_cache || !device->cache.gid_cache || + if (!device->cache.pkey_cache || !device->cache.lmc_cache) { printk(KERN_WARNING "Couldn't allocate cache " "for %s\n", device->name); - goto err; + return -ENOMEM; } - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - device->cache.pkey_cache[p] = NULL; - device->cache.gid_cache [p] = NULL; - ib_cache_update(device, p + start_port(device)); - } + err = gid_table_setup_one(device); + if (err) + /* Allocated memory will be cleaned in the release function */ + return err; + + for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) + ib_cache_update(device, p + rdma_start_port(device)); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); - if (ib_register_event_handler(&device->cache.event_handler)) - goto err_cache; - - return; + err = ib_register_event_handler(&device->cache.event_handler); + if (err) + goto err; -err_cache: - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - kfree(device->cache.pkey_cache[p]); - kfree(device->cache.gid_cache[p]); - } + return 0; err: - kfree(device->cache.pkey_cache); - kfree(device->cache.gid_cache); - kfree(device->cache.lmc_cache); + gid_table_cleanup_one(device); + return err; } -static void ib_cache_cleanup_one(struct ib_device *device) +void ib_cache_release_one(struct ib_device *device) { int p; - ib_unregister_event_handler(&device->cache.event_handler); - flush_workqueue(ib_wq); - - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - kfree(device->cache.pkey_cache[p]); - kfree(device->cache.gid_cache[p]); - } - + /* + * The release function frees all the cache elements. + * This function should be called as part of freeing + * all the device's resources when the cache could no + * longer be accessed. + */ + if (device->cache.pkey_cache) + for (p = 0; + p <= rdma_end_port(device) - rdma_start_port(device); ++p) + kfree(device->cache.pkey_cache[p]); + + gid_table_release_one(device); kfree(device->cache.pkey_cache); - kfree(device->cache.gid_cache); kfree(device->cache.lmc_cache); } -static struct ib_client cache_client = { - .name = "cache", - .add = ib_cache_setup_one, - .remove = ib_cache_cleanup_one -}; +void ib_cache_cleanup_one(struct ib_device *device) +{ + /* The cleanup function unregisters the event handler, + * waits for all in-progress workqueue elements and cleans + * up the GID cache. This function should be called after + * the device was removed from the devices list and all + * clients were removed, so the cache exists but is + * non-functional and shouldn't be updated anymore. + */ + ib_unregister_event_handler(&device->cache.event_handler); + flush_workqueue(ib_wq); + gid_table_cleanup_one(device); +} -int __init ib_cache_setup(void) +void __init ib_cache_setup(void) { - return ib_register_client(&cache_client); + roce_gid_mgmt_init(); } void __exit ib_cache_cleanup(void) { - ib_unregister_client(&cache_client); + roce_gid_mgmt_cleanup(); } diff --git a/kernel/drivers/infiniband/core/cm.c b/kernel/drivers/infiniband/core/cm.c index 0271608a5..d6d2b3582 100644 --- a/kernel/drivers/infiniband/core/cm.c +++ b/kernel/drivers/infiniband/core/cm.c @@ -58,7 +58,7 @@ MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); static void cm_add_one(struct ib_device *device); -static void cm_remove_one(struct ib_device *device); +static void cm_remove_one(struct ib_device *device, void *client_data); static struct ib_client cm_client = { .name = "cm", @@ -169,6 +169,7 @@ struct cm_device { struct ib_device *ib_device; struct device *device; u8 ack_delay; + int going_down; struct cm_port *port[0]; }; @@ -178,8 +179,6 @@ struct cm_av { struct ib_ah_attr ah_attr; u16 pkey_index; u8 timeout; - u8 valid; - u8 smac[ETH_ALEN]; }; struct cm_work { @@ -212,13 +211,15 @@ struct cm_id_private { spinlock_t lock; /* Do not acquire inside cm.lock */ struct completion comp; atomic_t refcount; + /* Number of clients sharing this ib_cm_id. Only valid for listeners. + * Protected by the cm.lock spinlock. */ + int listen_sharecount; struct ib_mad_send_buf *msg; struct cm_timewait_info *timewait_info; /* todo: use alternate port on send failure */ struct cm_av av; struct cm_av alt_av; - struct ib_cm_compare_data *compare_data; void *private_data; __be64 tid; @@ -267,7 +268,8 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, cm_id_priv->av.pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC); + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { ib_destroy_ah(ah); return PTR_ERR(m); @@ -297,7 +299,8 @@ static int cm_alloc_response_msg(struct cm_port *port, m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC); + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { ib_destroy_ah(ah); return PTR_ERR(m); @@ -356,17 +359,21 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) unsigned long flags; int ret; u8 p; + struct net_device *ndev = ib_get_ndev_from_path(path); read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - &p, NULL)) { + ndev, &p, NULL)) { port = cm_dev->port[p-1]; break; } } read_unlock_irqrestore(&cm.device_lock, flags); + if (ndev) + dev_put(ndev); + if (!port) return -EINVAL; @@ -379,9 +386,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, &av->ah_attr); av->timeout = path->packet_life_time + 1; - memcpy(av->smac, path->smac, sizeof(av->smac)); - av->valid = 1; return 0; } @@ -437,40 +442,6 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id) return cm_id_priv; } -static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask) -{ - int i; - - for (i = 0; i < IB_CM_COMPARE_SIZE; i++) - dst[i] = src[i] & mask[i]; -} - -static int cm_compare_data(struct ib_cm_compare_data *src_data, - struct ib_cm_compare_data *dst_data) -{ - u32 src[IB_CM_COMPARE_SIZE]; - u32 dst[IB_CM_COMPARE_SIZE]; - - if (!src_data || !dst_data) - return 0; - - cm_mask_copy(src, src_data->data, dst_data->mask); - cm_mask_copy(dst, dst_data->data, src_data->mask); - return memcmp(src, dst, sizeof(src)); -} - -static int cm_compare_private_data(u32 *private_data, - struct ib_cm_compare_data *dst_data) -{ - u32 src[IB_CM_COMPARE_SIZE]; - - if (!dst_data) - return 0; - - cm_mask_copy(src, private_data, dst_data->mask); - return memcmp(src, dst_data->data, sizeof(src)); -} - /* * Trivial helpers to strip endian annotation and compare; the * endianness doesn't actually matter since we just need a stable @@ -503,18 +474,14 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) struct cm_id_private *cur_cm_id_priv; __be64 service_id = cm_id_priv->id.service_id; __be64 service_mask = cm_id_priv->id.service_mask; - int data_cmp; while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, service_node); - data_cmp = cm_compare_data(cm_id_priv->compare_data, - cur_cm_id_priv->compare_data); if ((cur_cm_id_priv->id.service_mask & service_id) == (service_mask & cur_cm_id_priv->id.service_id) && - (cm_id_priv->id.device == cur_cm_id_priv->id.device) && - !data_cmp) + (cm_id_priv->id.device == cur_cm_id_priv->id.device)) return cur_cm_id_priv; if (cm_id_priv->id.device < cur_cm_id_priv->id.device) @@ -525,8 +492,6 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) link = &(*link)->rb_left; else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_right; - else if (data_cmp < 0) - link = &(*link)->rb_left; else link = &(*link)->rb_right; } @@ -536,20 +501,16 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) } static struct cm_id_private * cm_find_listen(struct ib_device *device, - __be64 service_id, - u32 *private_data) + __be64 service_id) { struct rb_node *node = cm.listen_service_table.rb_node; struct cm_id_private *cm_id_priv; - int data_cmp; while (node) { cm_id_priv = rb_entry(node, struct cm_id_private, service_node); - data_cmp = cm_compare_private_data(private_data, - cm_id_priv->compare_data); if ((cm_id_priv->id.service_mask & service_id) == cm_id_priv->id.service_id && - (cm_id_priv->id.device == device) && !data_cmp) + (cm_id_priv->id.device == device)) return cm_id_priv; if (device < cm_id_priv->id.device) @@ -560,8 +521,6 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device, node = node->rb_left; else if (be64_gt(service_id, cm_id_priv->id.service_id)) node = node->rb_right; - else if (data_cmp < 0) - node = node->rb_left; else node = node->rb_right; } @@ -803,6 +762,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) { int wait_time; unsigned long flags; + struct cm_device *cm_dev; + + cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); + if (!cm_dev) + return; spin_lock_irqsave(&cm.lock, flags); cm_cleanup_timewait(cm_id_priv->timewait_info); @@ -816,8 +780,14 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) */ cm_id_priv->id.state = IB_CM_TIMEWAIT; wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); - queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, - msecs_to_jiffies(wait_time)); + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) + queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, + msecs_to_jiffies(wait_time)); + spin_unlock_irqrestore(&cm.lock, flags); + cm_id_priv->timewait_info = NULL; } @@ -845,9 +815,15 @@ retest: spin_lock_irq(&cm_id_priv->lock); switch (cm_id->state) { case IB_CM_LISTEN: - cm_id->state = IB_CM_IDLE; spin_unlock_irq(&cm_id_priv->lock); + spin_lock_irq(&cm.lock); + if (--cm_id_priv->listen_sharecount > 0) { + /* The id is still shared. */ + cm_deref_id(cm_id_priv); + spin_unlock_irq(&cm.lock); + return; + } rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); spin_unlock_irq(&cm.lock); break; @@ -859,6 +835,11 @@ retest: case IB_CM_SIDR_REQ_RCVD: spin_unlock_irq(&cm_id_priv->lock); cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); + spin_lock_irq(&cm.lock); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) + rb_erase(&cm_id_priv->sidr_id_node, + &cm.remote_sidr_table); + spin_unlock_irq(&cm.lock); break; case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: @@ -916,7 +897,6 @@ retest: wait_for_completion(&cm_id_priv->comp); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) cm_free_work(work); - kfree(cm_id_priv->compare_data); kfree(cm_id_priv->private_data); kfree(cm_id_priv); } @@ -927,11 +907,23 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id) } EXPORT_SYMBOL(ib_destroy_cm_id); -int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, - struct ib_cm_compare_data *compare_data) +/** + * __ib_cm_listen - Initiates listening on the specified service ID for + * connection and service ID resolution requests. + * @cm_id: Connection identifier associated with the listen request. + * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * @service_mask: Mask applied to service ID used to listen across a + * range of service IDs. If set to 0, the service ID is matched + * exactly. This parameter is ignored if %service_id is set to + * IB_CM_ASSIGN_SERVICE_ID. + */ +static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, + __be64 service_mask) { struct cm_id_private *cm_id_priv, *cur_cm_id_priv; - unsigned long flags; int ret = 0; service_mask = service_mask ? service_mask : ~cpu_to_be64(0); @@ -944,20 +936,9 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, if (cm_id->state != IB_CM_IDLE) return -EINVAL; - if (compare_data) { - cm_id_priv->compare_data = kzalloc(sizeof *compare_data, - GFP_KERNEL); - if (!cm_id_priv->compare_data) - return -ENOMEM; - cm_mask_copy(cm_id_priv->compare_data->data, - compare_data->data, compare_data->mask); - memcpy(cm_id_priv->compare_data->mask, compare_data->mask, - sizeof(compare_data->mask)); - } - cm_id->state = IB_CM_LISTEN; + ++cm_id_priv->listen_sharecount; - spin_lock_irqsave(&cm.lock, flags); if (service_id == IB_CM_ASSIGN_SERVICE_ID) { cm_id->service_id = cpu_to_be64(cm.listen_service_id++); cm_id->service_mask = ~cpu_to_be64(0); @@ -966,18 +947,95 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, cm_id->service_mask = service_mask; } cur_cm_id_priv = cm_insert_listen(cm_id_priv); - spin_unlock_irqrestore(&cm.lock, flags); if (cur_cm_id_priv) { cm_id->state = IB_CM_IDLE; - kfree(cm_id_priv->compare_data); - cm_id_priv->compare_data = NULL; + --cm_id_priv->listen_sharecount; ret = -EBUSY; } return ret; } + +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm.lock, flags); + ret = __ib_cm_listen(cm_id, service_id, service_mask); + spin_unlock_irqrestore(&cm.lock, flags); + + return ret; +} EXPORT_SYMBOL(ib_cm_listen); +/** + * Create a new listening ib_cm_id and listen on the given service ID. + * + * If there's an existing ID listening on that same device and service ID, + * return it. + * + * @device: Device associated with the cm_id. All related communication will + * be associated with the specified device. + * @cm_handler: Callback invoked to notify the user of CM events. + * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * + * Callers should call ib_destroy_cm_id when done with the listener ID. + */ +struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, + ib_cm_handler cm_handler, + __be64 service_id) +{ + struct cm_id_private *cm_id_priv; + struct ib_cm_id *cm_id; + unsigned long flags; + int err = 0; + + /* Create an ID in advance, since the creation may sleep */ + cm_id = ib_create_cm_id(device, cm_handler, NULL); + if (IS_ERR(cm_id)) + return cm_id; + + spin_lock_irqsave(&cm.lock, flags); + + if (service_id == IB_CM_ASSIGN_SERVICE_ID) + goto new_id; + + /* Find an existing ID */ + cm_id_priv = cm_find_listen(device, service_id); + if (cm_id_priv) { + if (cm_id->cm_handler != cm_handler || cm_id->context) { + /* Sharing an ib_cm_id with different handlers is not + * supported */ + spin_unlock_irqrestore(&cm.lock, flags); + return ERR_PTR(-EINVAL); + } + atomic_inc(&cm_id_priv->refcount); + ++cm_id_priv->listen_sharecount; + spin_unlock_irqrestore(&cm.lock, flags); + + ib_destroy_cm_id(cm_id); + cm_id = &cm_id_priv->id; + return cm_id; + } + +new_id: + /* Use newly created ID */ + err = __ib_cm_listen(cm_id, service_id, 0); + + spin_unlock_irqrestore(&cm.lock, flags); + + if (err) { + ib_destroy_cm_id(cm_id); + return ERR_PTR(err); + } + return cm_id; +} +EXPORT_SYMBOL(ib_cm_insert_listen); + static __be64 cm_form_tid(struct cm_id_private *cm_id_priv, enum cm_msg_sequence msg_seq) { @@ -1254,6 +1312,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, primary_path->packet_life_time = cm_req_get_primary_local_ack_timeout(req_msg); primary_path->packet_life_time -= (primary_path->packet_life_time > 0); + primary_path->service_id = req_msg->service_id; if (req_msg->alt_local_lid) { memset(alt_path, 0, sizeof *alt_path); @@ -1275,7 +1334,26 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, alt_path->packet_life_time = cm_req_get_alt_local_ack_timeout(req_msg); alt_path->packet_life_time -= (alt_path->packet_life_time > 0); + alt_path->service_id = req_msg->service_id; + } +} + +static u16 cm_get_bth_pkey(struct cm_work *work) +{ + struct ib_device *ib_dev = work->port->cm_dev->ib_device; + u8 port_num = work->port->port_num; + u16 pkey_index = work->mad_recv_wc->wc->pkey_index; + u16 pkey; + int ret; + + ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey); + if (ret) { + dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n", + port_num, pkey_index, ret); + return 0; } + + return pkey; } static void cm_format_req_event(struct cm_work *work, @@ -1288,6 +1366,7 @@ static void cm_format_req_event(struct cm_work *work, req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.req_rcvd; param->listen_id = listen_id; + param->bth_pkey = cm_get_bth_pkey(work); param->port = cm_id_priv->av.port->port_num; param->primary_path = &work->path[0]; if (req_msg->alt_local_lid) @@ -1470,8 +1549,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, /* Find matching listen request. */ listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, - req_msg->service_id, - req_msg->private_data); + req_msg->service_id); if (!listen_cm_id_priv) { cm_cleanup_timewait(cm_id_priv->timewait_info); spin_unlock_irq(&cm.lock); @@ -1561,11 +1639,11 @@ static int cm_req_handler(struct cm_work *work) cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); - work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id; ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); if (ret) { ib_get_cached_gid(work->port->cm_dev->ib_device, - work->port->port_num, 0, &work->path[0].sgid); + work->port->port_num, 0, &work->path[0].sgid, + NULL); ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); @@ -2978,6 +3056,8 @@ static void cm_format_sidr_req_event(struct cm_work *work, param = &work->cm_event.param.sidr_req_rcvd; param->pkey = __be16_to_cpu(sidr_req_msg->pkey); param->listen_id = listen_id; + param->service_id = sidr_req_msg->service_id; + param->bth_pkey = cm_get_bth_pkey(work); param->port = work->port->port_num; work->cm_event.private_data = &sidr_req_msg->private_data; } @@ -3017,8 +3097,7 @@ static int cm_sidr_req_handler(struct cm_work *work) } cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; cur_cm_id_priv = cm_find_listen(cm_id->device, - sidr_req_msg->service_id, - sidr_req_msg->private_data); + sidr_req_msg->service_id); if (!cur_cm_id_priv) { spin_unlock_irq(&cm.lock); cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED); @@ -3098,7 +3177,10 @@ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, spin_unlock_irqrestore(&cm_id_priv->lock, flags); spin_lock_irqsave(&cm.lock, flags); - rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); + } spin_unlock_irqrestore(&cm.lock, flags); return 0; @@ -3303,6 +3385,11 @@ static int cm_establish(struct ib_cm_id *cm_id) struct cm_work *work; unsigned long flags; int ret = 0; + struct cm_device *cm_dev; + + cm_dev = ib_get_client_data(cm_id->device, &cm_client); + if (!cm_dev) + return -ENODEV; work = kmalloc(sizeof *work, GFP_ATOMIC); if (!work) @@ -3341,7 +3428,17 @@ static int cm_establish(struct ib_cm_id *cm_id) work->remote_id = cm_id->remote_id; work->mad_recv_wc = NULL; work->cm_event.event = IB_CM_USER_ESTABLISHED; - queue_delayed_work(cm.wq, &work->work, 0); + + /* Check if the device started its remove_one */ + spin_lock_irq(&cm.lock); + if (!cm_dev->going_down) { + queue_delayed_work(cm.wq, &work->work, 0); + } else { + kfree(work); + ret = -ENODEV; + } + spin_unlock_irq(&cm.lock); + out: return ret; } @@ -3392,6 +3489,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, enum ib_cm_event_type event; u16 attr_id; int paths = 0; + int going_down = 0; switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { case CM_REQ_ATTR_ID: @@ -3450,7 +3548,19 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, work->cm_event.event = event; work->mad_recv_wc = mad_recv_wc; work->port = port; - queue_delayed_work(cm.wq, &work->work, 0); + + /* Check if the device started its remove_one */ + spin_lock_irq(&cm.lock); + if (!port->cm_dev->going_down) + queue_delayed_work(cm.wq, &work->work, 0); + else + going_down = 1; + spin_unlock_irq(&cm.lock); + + if (going_down) { + kfree(work); + ib_free_recv_mad(mad_recv_wc); + } } static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, @@ -3508,32 +3618,6 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; - if (!cm_id_priv->av.valid) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return -EINVAL; - } - if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) { - qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id; - *qp_attr_mask |= IB_QP_VID; - } - if (!is_zero_ether_addr(cm_id_priv->av.smac)) { - memcpy(qp_attr->smac, cm_id_priv->av.smac, - sizeof(qp_attr->smac)); - *qp_attr_mask |= IB_QP_SMAC; - } - if (cm_id_priv->alt_av.valid) { - if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) { - qp_attr->alt_vlan_id = - cm_id_priv->alt_av.ah_attr.vlan_id; - *qp_attr_mask |= IB_QP_ALT_VID; - } - if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) { - memcpy(qp_attr->alt_smac, - cm_id_priv->alt_av.smac, - sizeof(qp_attr->alt_smac)); - *qp_attr_mask |= IB_QP_ALT_SMAC; - } - } qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); @@ -3759,11 +3843,9 @@ static void cm_add_one(struct ib_device *ib_device) }; unsigned long flags; int ret; + int count = 0; u8 i; - if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB) - return; - cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * ib_device->phys_port_cnt, GFP_KERNEL); if (!cm_dev) @@ -3771,7 +3853,7 @@ static void cm_add_one(struct ib_device *ib_device) cm_dev->ib_device = ib_device; cm_get_ack_delay(cm_dev); - + cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, "%s", ib_device->name); @@ -3782,6 +3864,9 @@ static void cm_add_one(struct ib_device *ib_device) set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i <= ib_device->phys_port_cnt; i++) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + port = kzalloc(sizeof *port, GFP_KERNEL); if (!port) goto error1; @@ -3808,7 +3893,13 @@ static void cm_add_one(struct ib_device *ib_device) ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) goto error3; + + count++; } + + if (!count) + goto free; + ib_set_client_data(ib_device, &cm_client, cm_dev); write_lock_irqsave(&cm.device_lock, flags); @@ -3824,18 +3915,22 @@ error1: port_modify.set_port_cap_mask = 0; port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; while (--i) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); ib_unregister_mad_agent(port->mad_agent); cm_remove_port_fs(port); } +free: device_unregister(cm_dev->device); kfree(cm_dev); } -static void cm_remove_one(struct ib_device *ib_device) +static void cm_remove_one(struct ib_device *ib_device, void *client_data) { - struct cm_device *cm_dev; + struct cm_device *cm_dev = client_data; struct cm_port *port; struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP @@ -3843,7 +3938,6 @@ static void cm_remove_one(struct ib_device *ib_device) unsigned long flags; int i; - cm_dev = ib_get_client_data(ib_device, &cm_client); if (!cm_dev) return; @@ -3851,11 +3945,23 @@ static void cm_remove_one(struct ib_device *ib_device) list_del(&cm_dev->list); write_unlock_irqrestore(&cm.device_lock, flags); + spin_lock_irq(&cm.lock); + cm_dev->going_down = 1; + spin_unlock_irq(&cm.lock); + for (i = 1; i <= ib_device->phys_port_cnt; i++) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); - ib_unregister_mad_agent(port->mad_agent); + /* + * We flush the queue here after the going_down set, this + * verify that no new works will be queued in the recv handler, + * after that we can call the unregister_mad_agent + */ flush_workqueue(cm.wq); + ib_unregister_mad_agent(port->mad_agent); cm_remove_port_fs(port); } device_unregister(cm_dev->device); diff --git a/kernel/drivers/infiniband/core/cma.c b/kernel/drivers/infiniband/core/cma.c index 38ffe0981..17a15c560 100644 --- a/kernel/drivers/infiniband/core/cma.c +++ b/kernel/drivers/infiniband/core/cma.c @@ -44,8 +44,12 @@ #include <linux/module.h> #include <net/route.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/tcp.h> #include <net/ipv6.h> +#include <net/ip_fib.h> +#include <net/ip6_route.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> @@ -65,8 +69,36 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 +static const char * const cma_events[] = { + [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", + [RDMA_CM_EVENT_ADDR_ERROR] = "address error", + [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", + [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", + [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", + [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", + [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", + [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", + [RDMA_CM_EVENT_REJECTED] = "rejected", + [RDMA_CM_EVENT_ESTABLISHED] = "established", + [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", + [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", + [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", + [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", + [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", + [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", +}; + +const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) +{ + size_t index = event; + + return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? + cma_events[index] : "unrecognized event"; +} +EXPORT_SYMBOL(rdma_event_msg); + static void cma_add_one(struct ib_device *device); -static void cma_remove_one(struct ib_device *device); +static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { .name = "cma", @@ -80,10 +112,37 @@ static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; -static DEFINE_IDR(tcp_ps); -static DEFINE_IDR(udp_ps); -static DEFINE_IDR(ipoib_ps); -static DEFINE_IDR(ib_ps); +static int cma_pernet_id; + +struct cma_pernet { + struct idr tcp_ps; + struct idr udp_ps; + struct idr ipoib_ps; + struct idr ib_ps; +}; + +static struct cma_pernet *cma_pernet(struct net *net) +{ + return net_generic(net, cma_pernet_id); +} + +static struct idr *cma_pernet_idr(struct net *net, enum rdma_port_space ps) +{ + struct cma_pernet *pernet = cma_pernet(net); + + switch (ps) { + case RDMA_PS_TCP: + return &pernet->tcp_ps; + case RDMA_PS_UDP: + return &pernet->udp_ps; + case RDMA_PS_IPOIB: + return &pernet->ipoib_ps; + case RDMA_PS_IB: + return &pernet->ib_ps; + default: + return NULL; + } +} struct cma_device { struct list_head list; @@ -94,11 +153,34 @@ struct cma_device { }; struct rdma_bind_list { - struct idr *ps; + enum rdma_port_space ps; struct hlist_head owners; unsigned short port; }; +static int cma_ps_alloc(struct net *net, enum rdma_port_space ps, + struct rdma_bind_list *bind_list, int snum) +{ + struct idr *idr = cma_pernet_idr(net, ps); + + return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL); +} + +static struct rdma_bind_list *cma_ps_find(struct net *net, + enum rdma_port_space ps, int snum) +{ + struct idr *idr = cma_pernet_idr(net, ps); + + return idr_find(idr, snum); +} + +static void cma_ps_remove(struct net *net, enum rdma_port_space ps, int snum) +{ + struct idr *idr = cma_pernet_idr(net, ps); + + idr_remove(idr, snum); +} + enum { CMA_OPTION_AFONLY, }; @@ -197,6 +279,15 @@ struct cma_hdr { #define CMA_VERSION 0x00 +struct cma_req_info { + struct ib_device *device; + int port; + union ib_gid local_gid; + __be64 service_id; + u16 pkey; + bool has_gid:1; +}; + static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) { unsigned long flags; @@ -234,7 +325,7 @@ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, return old; } -static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) +static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } @@ -349,18 +440,40 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a return ret; } +static inline int cma_validate_port(struct ib_device *device, u8 port, + union ib_gid *gid, int dev_type, + int bound_if_index) +{ + int ret = -ENODEV; + struct net_device *ndev = NULL; + + if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) + return ret; + + if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) + return ret; + + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) + ndev = dev_get_by_index(&init_net, bound_if_index); + + ret = ib_find_cached_gid_by_port(device, gid, port, ndev, NULL); + + if (ndev) + dev_put(ndev); + + return ret; +} + static int cma_acquire_dev(struct rdma_id_private *id_priv, struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid, iboe_gid; + union ib_gid gid, iboe_gid, *gidp; int ret = -ENODEV; - u8 port, found_port; - enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? - IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; + u8 port; - if (dev_ll != IB_LINK_LAYER_INFINIBAND && + if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; @@ -370,41 +483,38 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); - if (listen_id_priv && - rdma_port_get_link_layer(listen_id_priv->id.device, - listen_id_priv->id.port_num) == dev_ll) { + + if (listen_id_priv) { cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; - if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, - &found_port, NULL); - else - ret = ib_find_cached_gid(cma_dev->device, &gid, - &found_port, NULL); - - if (!ret && (port == found_port)) { - id_priv->id.port_num = found_port; + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; + + ret = cma_validate_port(cma_dev->device, port, gidp, + dev_addr->dev_type, + dev_addr->bound_dev_if); + if (!ret) { + id_priv->id.port_num = port; goto out; } } + list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { if (listen_id_priv && listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; - if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { - if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL); - else - ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL); - - if (!ret && (port == found_port)) { - id_priv->id.port_num = found_port; - goto out; - } + + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; + + ret = cma_validate_port(cma_dev->device, port, gidp, + dev_addr->dev_type, + dev_addr->bound_dev_if); + if (!ret) { + id_priv->id.port_num = port; + goto out; } } } @@ -435,14 +545,16 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) pkey = ntohs(addr->sib_pkey); list_for_each_entry(cur_dev, &dev_list, list) { - if (rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB) - continue; - for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (!rdma_cap_af_ib(cur_dev->device, p)) + continue; + if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) continue; - for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, &gid); i++) { + for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, + &gid, NULL); + i++) { if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; @@ -488,7 +600,8 @@ static int cma_disable_callback(struct rdma_id_private *id_priv, return 0; } -struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, +struct rdma_cm_id *rdma_create_id(struct net *net, + rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, enum ib_qp_type qp_type) { @@ -512,6 +625,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); + id_priv->id.route.addr.dev_addr.net = get_net(net); return &id_priv->id; } @@ -629,19 +743,12 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, goto out; ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, - qp_attr.ah_attr.grh.sgid_index, &sgid); + qp_attr.ah_attr.grh.sgid_index, &sgid, NULL); if (ret) goto out; - if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) - == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) - == IB_LINK_LAYER_ETHERNET) { - ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL); + BUG_ON(id_priv->cma_dev->device != id_priv->id.device); - if (ret) - goto out; - } if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); @@ -700,11 +807,10 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, int ret; u16 pkey; - if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == - IB_LINK_LAYER_INFINIBAND) - pkey = ib_addr_get_pkey(dev_addr); - else + if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; + else + pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); @@ -735,8 +841,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else @@ -745,19 +850,15 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); - break; - default: + } else ret = -ENOSYS; - break; - } return ret; } @@ -837,107 +938,419 @@ static inline int cma_any_port(struct sockaddr *addr) return !cma_port(addr); } -static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, +static void cma_save_ib_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_cm_id *listen_id, struct ib_sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; - ib = (struct sockaddr_ib *) &id->route.addr.src_addr; - ib->sib_family = listen_ib->sib_family; - if (path) { - ib->sib_pkey = path->pkey; - ib->sib_flowinfo = path->flow_label; - memcpy(&ib->sib_addr, &path->sgid, 16); - } else { - ib->sib_pkey = listen_ib->sib_pkey; - ib->sib_flowinfo = listen_ib->sib_flowinfo; - ib->sib_addr = listen_ib->sib_addr; + if (src_addr) { + ib = (struct sockaddr_ib *)src_addr; + ib->sib_family = AF_IB; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->sgid, 16); + ib->sib_sid = path->service_id; + ib->sib_scope_id = 0; + } else { + ib->sib_pkey = listen_ib->sib_pkey; + ib->sib_flowinfo = listen_ib->sib_flowinfo; + ib->sib_addr = listen_ib->sib_addr; + ib->sib_sid = listen_ib->sib_sid; + ib->sib_scope_id = listen_ib->sib_scope_id; + } + ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); } - ib->sib_sid = listen_ib->sib_sid; - ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); - ib->sib_scope_id = listen_ib->sib_scope_id; - - if (path) { - ib = (struct sockaddr_ib *) &id->route.addr.dst_addr; - ib->sib_family = listen_ib->sib_family; - ib->sib_pkey = path->pkey; - ib->sib_flowinfo = path->flow_label; - memcpy(&ib->sib_addr, &path->dgid, 16); + if (dst_addr) { + ib = (struct sockaddr_ib *)dst_addr; + ib->sib_family = AF_IB; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->dgid, 16); + } } } -static __be16 ss_get_port(const struct sockaddr_storage *ss) -{ - if (ss->ss_family == AF_INET) - return ((struct sockaddr_in *)ss)->sin_port; - else if (ss->ss_family == AF_INET6) - return ((struct sockaddr_in6 *)ss)->sin6_port; - BUG(); -} - -static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, - struct cma_hdr *hdr) +static void cma_save_ip4_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct cma_hdr *hdr, + __be16 local_port) { struct sockaddr_in *ip4; - ip4 = (struct sockaddr_in *) &id->route.addr.src_addr; - ip4->sin_family = AF_INET; - ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr; - ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr); + if (src_addr) { + ip4 = (struct sockaddr_in *)src_addr; + ip4->sin_family = AF_INET; + ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr; + ip4->sin_port = local_port; + } - ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr; - ip4->sin_family = AF_INET; - ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr; - ip4->sin_port = hdr->port; + if (dst_addr) { + ip4 = (struct sockaddr_in *)dst_addr; + ip4->sin_family = AF_INET; + ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr; + ip4->sin_port = hdr->port; + } } -static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, - struct cma_hdr *hdr) +static void cma_save_ip6_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct cma_hdr *hdr, + __be16 local_port) { struct sockaddr_in6 *ip6; - ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr; - ip6->sin6_family = AF_INET6; - ip6->sin6_addr = hdr->dst_addr.ip6; - ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr); + if (src_addr) { + ip6 = (struct sockaddr_in6 *)src_addr; + ip6->sin6_family = AF_INET6; + ip6->sin6_addr = hdr->dst_addr.ip6; + ip6->sin6_port = local_port; + } - ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr; - ip6->sin6_family = AF_INET6; - ip6->sin6_addr = hdr->src_addr.ip6; - ip6->sin6_port = hdr->port; + if (dst_addr) { + ip6 = (struct sockaddr_in6 *)dst_addr; + ip6->sin6_family = AF_INET6; + ip6->sin6_addr = hdr->src_addr.ip6; + ip6->sin6_port = hdr->port; + } } -static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event) +static u16 cma_port_from_service_id(__be64 service_id) { - struct cma_hdr *hdr; + return (u16)be64_to_cpu(service_id); +} - if (listen_id->route.addr.src_addr.ss_family == AF_IB) { - if (ib_event->event == IB_CM_REQ_RECEIVED) - cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path); - else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) - cma_save_ib_info(id, listen_id, NULL); - return 0; - } +static int cma_save_ip_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct ib_cm_event *ib_event, + __be64 service_id) +{ + struct cma_hdr *hdr; + __be16 port; hdr = ib_event->private_data; if (hdr->cma_version != CMA_VERSION) return -EINVAL; + port = htons(cma_port_from_service_id(service_id)); + switch (cma_get_ip_ver(hdr)) { case 4: - cma_save_ip4_info(id, listen_id, hdr); + cma_save_ip4_info(src_addr, dst_addr, hdr, port); break; case 6: - cma_save_ip6_info(id, listen_id, hdr); + cma_save_ip6_info(src_addr, dst_addr, hdr, port); + break; + default: + return -EAFNOSUPPORT; + } + + return 0; +} + +static int cma_save_net_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event, + sa_family_t sa_family, __be64 service_id) +{ + if (sa_family == AF_IB) { + if (ib_event->event == IB_CM_REQ_RECEIVED) + cma_save_ib_info(src_addr, dst_addr, listen_id, + ib_event->param.req_rcvd.primary_path); + else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) + cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); + return 0; + } + + return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); +} + +static int cma_save_req_info(const struct ib_cm_event *ib_event, + struct cma_req_info *req) +{ + const struct ib_cm_req_event_param *req_param = + &ib_event->param.req_rcvd; + const struct ib_cm_sidr_req_event_param *sidr_param = + &ib_event->param.sidr_req_rcvd; + + switch (ib_event->event) { + case IB_CM_REQ_RECEIVED: + req->device = req_param->listen_id->device; + req->port = req_param->port; + memcpy(&req->local_gid, &req_param->primary_path->sgid, + sizeof(req->local_gid)); + req->has_gid = true; + req->service_id = req_param->primary_path->service_id; + req->pkey = be16_to_cpu(req_param->primary_path->pkey); + break; + case IB_CM_SIDR_REQ_RECEIVED: + req->device = sidr_param->listen_id->device; + req->port = sidr_param->port; + req->has_gid = false; + req->service_id = sidr_param->service_id; + req->pkey = sidr_param->pkey; break; default: return -EINVAL; } + return 0; } +static bool validate_ipv4_net_dev(struct net_device *net_dev, + const struct sockaddr_in *dst_addr, + const struct sockaddr_in *src_addr) +{ + __be32 daddr = dst_addr->sin_addr.s_addr, + saddr = src_addr->sin_addr.s_addr; + struct fib_result res; + struct flowi4 fl4; + int err; + bool ret; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || + ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || + ipv4_is_loopback(saddr)) + return false; + + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_iif = net_dev->ifindex; + fl4.daddr = daddr; + fl4.saddr = saddr; + + rcu_read_lock(); + err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); + ret = err == 0 && FIB_RES_DEV(res) == net_dev; + rcu_read_unlock(); + + return ret; +} + +static bool validate_ipv6_net_dev(struct net_device *net_dev, + const struct sockaddr_in6 *dst_addr, + const struct sockaddr_in6 *src_addr) +{ +#if IS_ENABLED(CONFIG_IPV6) + const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & + IPV6_ADDR_LINKLOCAL; + struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, + &src_addr->sin6_addr, net_dev->ifindex, + strict); + bool ret; + + if (!rt) + return false; + + ret = rt->rt6i_idev->dev == net_dev; + ip6_rt_put(rt); + + return ret; +#else + return false; +#endif +} + +static bool validate_net_dev(struct net_device *net_dev, + const struct sockaddr *daddr, + const struct sockaddr *saddr) +{ + const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; + const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; + const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; + + switch (daddr->sa_family) { + case AF_INET: + return saddr->sa_family == AF_INET && + validate_ipv4_net_dev(net_dev, daddr4, saddr4); + + case AF_INET6: + return saddr->sa_family == AF_INET6 && + validate_ipv6_net_dev(net_dev, daddr6, saddr6); + + default: + return false; + } +} + +static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, + const struct cma_req_info *req) +{ + struct sockaddr_storage listen_addr_storage, src_addr_storage; + struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage, + *src_addr = (struct sockaddr *)&src_addr_storage; + struct net_device *net_dev; + const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL; + int err; + + err = cma_save_ip_info(listen_addr, src_addr, ib_event, + req->service_id); + if (err) + return ERR_PTR(err); + + net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, + gid, listen_addr); + if (!net_dev) + return ERR_PTR(-ENODEV); + + if (!validate_net_dev(net_dev, listen_addr, src_addr)) { + dev_put(net_dev); + return ERR_PTR(-EHOSTUNREACH); + } + + return net_dev; +} + +static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id) +{ + return (be64_to_cpu(service_id) >> 16) & 0xffff; +} + +static bool cma_match_private_data(struct rdma_id_private *id_priv, + const struct cma_hdr *hdr) +{ + struct sockaddr *addr = cma_src_addr(id_priv); + __be32 ip4_addr; + struct in6_addr ip6_addr; + + if (cma_any_addr(addr) && !id_priv->afonly) + return true; + + switch (addr->sa_family) { + case AF_INET: + ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; + if (cma_get_ip_ver(hdr) != 4) + return false; + if (!cma_any_addr(addr) && + hdr->dst_addr.ip4.addr != ip4_addr) + return false; + break; + case AF_INET6: + ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; + if (cma_get_ip_ver(hdr) != 6) + return false; + if (!cma_any_addr(addr) && + memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) + return false; + break; + case AF_IB: + return true; + default: + return false; + } + + return true; +} + +static bool cma_protocol_roce_dev_port(struct ib_device *device, int port_num) +{ + enum rdma_link_layer ll = rdma_port_get_link_layer(device, port_num); + enum rdma_transport_type transport = + rdma_node_get_transport(device->node_type); + + return ll == IB_LINK_LAYER_ETHERNET && transport == RDMA_TRANSPORT_IB; +} + +static bool cma_protocol_roce(const struct rdma_cm_id *id) +{ + struct ib_device *device = id->device; + const int port_num = id->port_num ?: rdma_start_port(device); + + return cma_protocol_roce_dev_port(device, port_num); +} + +static bool cma_match_net_dev(const struct rdma_cm_id *id, + const struct net_device *net_dev, + u8 port_num) +{ + const struct rdma_addr *addr = &id->route.addr; + + if (!net_dev) + /* This request is an AF_IB request or a RoCE request */ + return (!id->port_num || id->port_num == port_num) && + (addr->src_addr.ss_family == AF_IB || + cma_protocol_roce_dev_port(id->device, port_num)); + + return !addr->dev_addr.bound_dev_if || + (net_eq(dev_net(net_dev), addr->dev_addr.net) && + addr->dev_addr.bound_dev_if == net_dev->ifindex); +} + +static struct rdma_id_private *cma_find_listener( + const struct rdma_bind_list *bind_list, + const struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event, + const struct cma_req_info *req, + const struct net_device *net_dev) +{ + struct rdma_id_private *id_priv, *id_priv_dev; + + if (!bind_list) + return ERR_PTR(-EINVAL); + + hlist_for_each_entry(id_priv, &bind_list->owners, node) { + if (cma_match_private_data(id_priv, ib_event->private_data)) { + if (id_priv->id.device == cm_id->device && + cma_match_net_dev(&id_priv->id, net_dev, req->port)) + return id_priv; + list_for_each_entry(id_priv_dev, + &id_priv->listen_list, + listen_list) { + if (id_priv_dev->id.device == cm_id->device && + cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) + return id_priv_dev; + } + } + } + + return ERR_PTR(-EINVAL); +} + +static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id, + struct ib_cm_event *ib_event, + struct net_device **net_dev) +{ + struct cma_req_info req; + struct rdma_bind_list *bind_list; + struct rdma_id_private *id_priv; + int err; + + err = cma_save_req_info(ib_event, &req); + if (err) + return ERR_PTR(err); + + *net_dev = cma_get_net_dev(ib_event, &req); + if (IS_ERR(*net_dev)) { + if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { + /* Assuming the protocol is AF_IB */ + *net_dev = NULL; + } else if (cma_protocol_roce_dev_port(req.device, req.port)) { + /* TODO find the net dev matching the request parameters + * through the RoCE GID table */ + *net_dev = NULL; + } else { + return ERR_CAST(*net_dev); + } + } + + bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, + rdma_ps_from_service_id(req.service_id), + cma_port_from_service_id(req.service_id)); + id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); + if (IS_ERR(id_priv) && *net_dev) { + dev_put(*net_dev); + *net_dev = NULL; + } + + return id_priv; +} + static inline int cma_user_data_offset(struct rdma_id_private *id_priv) { return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); @@ -945,13 +1358,9 @@ static inline int cma_user_data_offset(struct rdma_id_private *id_priv) static void cma_cancel_route(struct rdma_id_private *id_priv) { - switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) { - case IB_LINK_LAYER_INFINIBAND: + if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); - break; - default: - break; } } @@ -1002,6 +1411,7 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv, static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; + struct net *net = id_priv->id.route.addr.dev_addr.net; if (!bind_list) return; @@ -1009,7 +1419,7 @@ static void cma_release_port(struct rdma_id_private *id_priv) mutex_lock(&lock); hlist_del(&id_priv->node); if (hlist_empty(&bind_list->owners)) { - idr_remove(bind_list->ps, bind_list->port); + cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); @@ -1023,17 +1433,12 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) mc = container_of(id_priv->mc_list.next, struct cma_multicast, list); list_del(&mc->list); - switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { - case IB_LINK_LAYER_INFINIBAND: + if (rdma_cap_ib_mcast(id_priv->cma_dev->device, + id_priv->id.port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - break; - case IB_LINK_LAYER_ETHERNET: + } else kref_put(&mc->mcref, release_mc); - break; - default: - break; - } } } @@ -1054,17 +1459,12 @@ void rdma_destroy_id(struct rdma_cm_id *id) mutex_unlock(&id_priv->handler_mutex); if (id_priv->cma_dev) { - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); - break; - default: - break; } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); @@ -1078,6 +1478,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) cma_deref_id(id_priv->id.context); kfree(id_priv->id.route.path_rec); + put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); @@ -1197,20 +1598,27 @@ out: } static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event) + struct ib_cm_event *ib_event, + struct net_device *net_dev) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; + const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + const __be64 service_id = + ib_event->param.req_rcvd.primary_path->service_id; int ret; - id = rdma_create_id(listen_id->event_handler, listen_id->context, + id = rdma_create_id(listen_id->route.addr.dev_addr.net, + listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type); if (IS_ERR(id)) return NULL; id_priv = container_of(id, struct rdma_id_private, id); - if (cma_save_net_info(id, listen_id, ib_event)) + if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, + (struct sockaddr *)&id->route.addr.dst_addr, + listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; @@ -1224,14 +1632,21 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; - if (cma_any_addr(cma_src_addr(id_priv))) { - rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; - rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); - ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); - } else { - ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); + if (net_dev) { + ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); if (ret) goto err; + } else { + if (!cma_protocol_roce(listen_id) && + cma_any_addr(cma_src_addr(id_priv))) { + rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); + ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); + } else if (!cma_any_addr(cma_src_addr(id_priv))) { + ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); + if (ret) + goto err; + } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); @@ -1244,25 +1659,38 @@ err: } static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event) + struct ib_cm_event *ib_event, + struct net_device *net_dev) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; + const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + struct net *net = listen_id->route.addr.dev_addr.net; int ret; - id = rdma_create_id(listen_id->event_handler, listen_id->context, + id = rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD); if (IS_ERR(id)) return NULL; id_priv = container_of(id, struct rdma_id_private, id); - if (cma_save_net_info(id, listen_id, ib_event)) + if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, + (struct sockaddr *)&id->route.addr.dst_addr, + listen_id, ib_event, ss_family, + ib_event->param.sidr_req_rcvd.service_id)) goto err; - if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { - ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); + if (net_dev) { + ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); if (ret) goto err; + } else { + if (!cma_any_addr(cma_src_addr(id_priv))) { + ret = cma_translate_addr(cma_src_addr(id_priv), + &id->route.addr.dev_addr); + if (ret) + goto err; + } } id_priv->state = RDMA_CM_CONNECT; @@ -1300,25 +1728,33 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; + struct net_device *net_dev; int offset, ret; - listen_id = cm_id->context; - if (!cma_check_req_qp_type(&listen_id->id, ib_event)) - return -EINVAL; + listen_id = cma_id_from_event(cm_id, ib_event, &net_dev); + if (IS_ERR(listen_id)) + return PTR_ERR(listen_id); - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) - return -ECONNABORTED; + if (!cma_check_req_qp_type(&listen_id->id, ib_event)) { + ret = -EINVAL; + goto net_dev_put; + } + + if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) { + ret = -ECONNABORTED; + goto net_dev_put; + } memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { - conn_id = cma_new_udp_id(&listen_id->id, ib_event); + conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { - conn_id = cma_new_conn_id(&listen_id->id, ib_event); + conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } @@ -1356,6 +1792,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) mutex_unlock(&conn_id->handler_mutex); mutex_unlock(&listen_id->handler_mutex); cma_deref_id(conn_id); + if (net_dev) + dev_put(net_dev); return 0; err3: @@ -1369,6 +1807,11 @@ err1: mutex_unlock(&listen_id->handler_mutex); if (conn_id) rdma_destroy_id(&conn_id->id); + +net_dev_put: + if (net_dev) + dev_put(net_dev); + return ret; } @@ -1381,42 +1824,6 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) } EXPORT_SYMBOL(rdma_get_service_id); -static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, - struct ib_cm_compare_data *compare) -{ - struct cma_hdr *cma_data, *cma_mask; - __be32 ip4_addr; - struct in6_addr ip6_addr; - - memset(compare, 0, sizeof *compare); - cma_data = (void *) compare->data; - cma_mask = (void *) compare->mask; - - switch (addr->sa_family) { - case AF_INET: - ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; - cma_set_ip_ver(cma_data, 4); - cma_set_ip_ver(cma_mask, 0xF); - if (!cma_any_addr(addr)) { - cma_data->dst_addr.ip4.addr = ip4_addr; - cma_mask->dst_addr.ip4.addr = htonl(~0); - } - break; - case AF_INET6: - ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; - cma_set_ip_ver(cma_data, 6); - cma_set_ip_ver(cma_mask, 0xF); - if (!cma_any_addr(addr)) { - cma_data->dst_addr.ip6 = ip6_addr; - memset(&cma_mask->dst_addr.ip6, 0xFF, - sizeof cma_mask->dst_addr.ip6); - } - break; - default: - break; - } -} - static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; @@ -1498,7 +1905,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, return -ECONNABORTED; /* Create a new RDMA id for the new IW CM ID */ - new_cm_id = rdma_create_id(listen_id->id.event_handler, + new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, + listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(new_cm_id)) { @@ -1570,33 +1978,18 @@ out: static int cma_ib_listen(struct rdma_id_private *id_priv) { - struct ib_cm_compare_data compare_data; struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; - int ret; - id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); + addr = cma_src_addr(id_priv); + svc_id = rdma_get_service_id(&id_priv->id, addr); + id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); - id_priv->cm_id.ib = id; - addr = cma_src_addr(id_priv); - svc_id = rdma_get_service_id(&id_priv->id, addr); - if (cma_any_addr(addr) && !id_priv->afonly) - ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); - else { - cma_set_compare_data(id_priv->id.ps, addr, &compare_data); - ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data); - } - - if (ret) { - ib_destroy_cm_id(id_priv->cm_id.ib); - id_priv->cm_id.ib = NULL; - } - - return ret; + return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) @@ -1610,6 +2003,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) if (IS_ERR(id)) return PTR_ERR(id); + id->tos = id_priv->tos; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), @@ -1640,13 +2034,13 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, { struct rdma_id_private *dev_id_priv; struct rdma_cm_id *id; + struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; - if (cma_family(id_priv) == AF_IB && - rdma_node_get_transport(cma_dev->device->node_type) != RDMA_TRANSPORT_IB) + if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return; - id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, + id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type); if (IS_ERR(id)) return; @@ -1925,16 +2319,17 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->num_paths = 1; - if (addr->dev_addr.bound_dev_if) + if (addr->dev_addr.bound_dev_if) { ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + route->path_rec->net = &init_net; + route->path_rec->ifindex = addr->dev_addr.bound_dev_if; + } if (!ndev) { ret = -ENODEV; goto err2; } - route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev); memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); - memcpy(route->path_rec->smac, ndev->dev_addr, ndev->addr_len); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); @@ -1984,26 +2379,15 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) return -EINVAL; atomic_inc(&id_priv->refcount); - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(id->device, id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ret = cma_resolve_ib_route(id_priv, timeout_ms); - break; - case IB_LINK_LAYER_ETHERNET: - ret = cma_resolve_iboe_route(id_priv); - break; - default: - ret = -ENOSYS; - } - break; - case RDMA_TRANSPORT_IWARP: + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_route(id_priv, timeout_ms); + else if (rdma_protocol_roce(id->device, id->port_num)) + ret = cma_resolve_iboe_route(id_priv); + else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv, timeout_ms); - break; - default: + else ret = -ENOSYS; - break; - } + if (ret) goto err; @@ -2045,7 +2429,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { if (cma_family(id_priv) == AF_IB && - rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB) + !rdma_cap_ib_cm(cur_dev->device, 1)) continue; if (!cma_dev) @@ -2068,7 +2452,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) p = 1; port_found: - ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid); + ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL); if (ret) goto out; @@ -2077,7 +2461,7 @@ port_found: goto out; id_priv->id.route.addr.dev_addr.dev_type = - (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? + (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); @@ -2195,8 +2579,11 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, src_addr = (struct sockaddr *) &id->route.addr.src_addr; src_addr->sa_family = dst_addr->sa_family; if (dst_addr->sa_family == AF_INET6) { - ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = - ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; + struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr; + struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr; + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; } else if (dst_addr->sa_family == AF_IB) { ((struct sockaddr_ib *) src_addr)->sib_pkey = ((struct sockaddr_ib *) dst_addr)->sib_pkey; @@ -2317,8 +2704,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list, hlist_add_head(&id_priv->node, &bind_list->owners); } -static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, - unsigned short snum) +static int cma_alloc_port(enum rdma_port_space ps, + struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int ret; @@ -2327,7 +2714,8 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, if (!bind_list) return -ENOMEM; - ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL); + ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, + snum); if (ret < 0) goto err; @@ -2340,18 +2728,20 @@ err: return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } -static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) +static int cma_alloc_any_port(enum rdma_port_space ps, + struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; + struct net *net = id_priv->id.route.addr.dev_addr.net; - inet_get_local_port_range(&init_net, &low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = prandom_u32() % remaining + low; retry: if (last_used_port != rover && - !idr_find(ps, (unsigned short) rover)) { + !cma_ps_find(net, ps, (unsigned short)rover)) { int ret = cma_alloc_port(ps, id_priv, rover); /* * Remember previously used port number in order to avoid @@ -2406,7 +2796,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list, return 0; } -static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) +static int cma_use_port(enum rdma_port_space ps, + struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; @@ -2416,7 +2807,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; - bind_list = idr_find(ps, snum); + bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { @@ -2439,25 +2830,24 @@ static int cma_bind_listen(struct rdma_id_private *id_priv) return ret; } -static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv) +static enum rdma_port_space cma_select_inet_ps( + struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: - return &tcp_ps; case RDMA_PS_UDP: - return &udp_ps; case RDMA_PS_IPOIB: - return &ipoib_ps; case RDMA_PS_IB: - return &ib_ps; + return id_priv->id.ps; default: - return NULL; + + return 0; } } -static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv) +static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) { - struct idr *ps = NULL; + enum rdma_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; @@ -2467,15 +2857,15 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv) if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { sid_ps = RDMA_IB_IP_PS_IB; - ps = &ib_ps; + ps = RDMA_PS_IB; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && (sid == (RDMA_IB_IP_PS_TCP & mask))) { sid_ps = RDMA_IB_IP_PS_TCP; - ps = &tcp_ps; + ps = RDMA_PS_TCP; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && (sid == (RDMA_IB_IP_PS_UDP & mask))) { sid_ps = RDMA_IB_IP_PS_UDP; - ps = &udp_ps; + ps = RDMA_PS_UDP; } if (ps) { @@ -2488,7 +2878,7 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv) static int cma_get_port(struct rdma_id_private *id_priv) { - struct idr *ps; + enum rdma_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) @@ -2554,18 +2944,15 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) id_priv->backlog = backlog; if (id->device) { - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; - break; - default: + } else { ret = -ENOSYS; goto err; } @@ -2612,8 +2999,11 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (addr->sa_family == AF_INET) id_priv->afonly = 1; #if IS_ENABLED(CONFIG_IPV6) - else if (addr->sa_family == AF_INET6) - id_priv->afonly = init_net.ipv6.sysctl.bindv6only; + else if (addr->sa_family == AF_INET6) { + struct net *net = id_priv->id.route.addr.dev_addr.net; + + id_priv->afonly = net->ipv6.sysctl.bindv6only; + } #endif } ret = cma_get_port(id_priv); @@ -2857,6 +3247,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, if (IS_ERR(cm_id)) return PTR_ERR(cm_id); + cm_id->tos = id_priv->tos; id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), @@ -2901,20 +3292,15 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) id_priv->srq = conn_param->srq; } - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) ret = cma_connect_iw(id_priv, conn_param); - break; - default: + else ret = -ENOSYS; - break; - } if (ret) goto err; @@ -3017,8 +3403,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) id_priv->srq = conn_param->srq; } - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, @@ -3034,14 +3419,10 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) else ret = cma_rep_recv(id_priv); } - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) ret = cma_accept_iw(id_priv, conn_param); - break; - default: + else ret = -ENOSYS; - break; - } if (ret) goto reject; @@ -3085,8 +3466,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, if (!id_priv->cm_id.ib) return -EINVAL; - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); @@ -3094,15 +3474,12 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, ret = ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, private_data, private_data_len); - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); - break; - default: + } else ret = -ENOSYS; - break; - } + return ret; } EXPORT_SYMBOL(rdma_reject); @@ -3116,22 +3493,18 @@ int rdma_disconnect(struct rdma_cm_id *id) if (!id_priv->cm_id.ib) return -EINVAL; - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); - break; - default: + } else ret = -EINVAL; - break; - } + out: return ret; } @@ -3377,24 +3750,13 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(id->device, id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ret = cma_join_ib_multicast(id_priv, mc); - break; - case IB_LINK_LAYER_ETHERNET: - kref_init(&mc->mcref); - ret = cma_iboe_join_multicast(id_priv, mc); - break; - default: - ret = -EINVAL; - } - break; - default: + if (rdma_protocol_roce(id->device, id->port_num)) { + kref_init(&mc->mcref); + ret = cma_iboe_join_multicast(id_priv, mc); + } else if (rdma_cap_ib_mcast(id->device, id->port_num)) + ret = cma_join_ib_multicast(id_priv, mc); + else ret = -ENOSYS; - break; - } if (ret) { spin_lock_irq(&id_priv->lock); @@ -3422,19 +3784,15 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) ib_detach_mcast(id->qp, &mc->multicast.ib->rec.mgid, be16_to_cpu(mc->multicast.ib->rec.mlid)); - if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { - switch (rdma_port_get_link_layer(id->device, id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - break; - case IB_LINK_LAYER_ETHERNET: - kref_put(&mc->mcref, release_mc); - break; - default: - break; - } - } + + BUG_ON(id_priv->cma_dev->device != id->device); + + if (rdma_cap_ib_mcast(id->device, id->port_num)) { + ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + } else if (rdma_protocol_roce(id->device, id->port_num)) + kref_put(&mc->mcref, release_mc); + return; } } @@ -3450,6 +3808,7 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id dev_addr = &id_priv->id.route.addr.dev_addr; if ((dev_addr->bound_dev_if == ndev->ifindex) && + (net_eq(dev_net(ndev), dev_addr->net)) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); @@ -3475,9 +3834,6 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event, struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; - if (dev_net(ndev) != &init_net) - return NOTIFY_DONE; - if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; @@ -3578,11 +3934,10 @@ static void cma_process_remove(struct cma_device *cma_dev) wait_for_completion(&cma_dev->comp); } -static void cma_remove_one(struct ib_device *device) +static void cma_remove_one(struct ib_device *device, void *client_data) { - struct cma_device *cma_dev; + struct cma_device *cma_dev = client_data; - cma_dev = ib_get_client_data(device, &cma_client); if (!cma_dev) return; @@ -3673,6 +4028,35 @@ static const struct ibnl_client_cbs cma_cb_table[] = { .module = THIS_MODULE }, }; +static int cma_init_net(struct net *net) +{ + struct cma_pernet *pernet = cma_pernet(net); + + idr_init(&pernet->tcp_ps); + idr_init(&pernet->udp_ps); + idr_init(&pernet->ipoib_ps); + idr_init(&pernet->ib_ps); + + return 0; +} + +static void cma_exit_net(struct net *net) +{ + struct cma_pernet *pernet = cma_pernet(net); + + idr_destroy(&pernet->tcp_ps); + idr_destroy(&pernet->udp_ps); + idr_destroy(&pernet->ipoib_ps); + idr_destroy(&pernet->ib_ps); +} + +static struct pernet_operations cma_pernet_operations = { + .init = cma_init_net, + .exit = cma_exit_net, + .id = &cma_pernet_id, + .size = sizeof(struct cma_pernet), +}; + static int __init cma_init(void) { int ret; @@ -3681,6 +4065,10 @@ static int __init cma_init(void) if (!cma_wq) return -ENOMEM; + ret = register_pernet_subsys(&cma_pernet_operations); + if (ret) + goto err_wq; + ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); @@ -3698,6 +4086,7 @@ err: unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); +err_wq: destroy_workqueue(cma_wq); return ret; } @@ -3709,11 +4098,8 @@ static void __exit cma_cleanup(void) unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); + unregister_pernet_subsys(&cma_pernet_operations); destroy_workqueue(cma_wq); - idr_destroy(&tcp_ps); - idr_destroy(&udp_ps); - idr_destroy(&ipoib_ps); - idr_destroy(&ib_ps); } module_init(cma_init); diff --git a/kernel/drivers/infiniband/core/core_priv.h b/kernel/drivers/infiniband/core/core_priv.h index 87d1936f5..5cf6eb716 100644 --- a/kernel/drivers/infiniband/core/core_priv.h +++ b/kernel/drivers/infiniband/core/core_priv.h @@ -43,12 +43,53 @@ int ib_device_register_sysfs(struct ib_device *device, u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); -int ib_sysfs_setup(void); -void ib_sysfs_cleanup(void); - -int ib_cache_setup(void); +void ib_cache_setup(void); void ib_cache_cleanup(void); -int ib_resolve_eth_l2_attrs(struct ib_qp *qp, - struct ib_qp_attr *qp_attr, int *qp_attr_mask); +int ib_resolve_eth_dmac(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask); + +typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); + +typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); + +void ib_enum_roce_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie); +void ib_enum_all_roce_netdevs(roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie); + +enum ib_cache_gid_default_mode { + IB_CACHE_GID_DEFAULT_MODE_SET, + IB_CACHE_GID_DEFAULT_MODE_DELETE +}; + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, + struct net_device *ndev, + enum ib_cache_gid_default_mode mode); + +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *ndev); + +int roce_gid_mgmt_init(void); +void roce_gid_mgmt_cleanup(void); + +int roce_rescan_device(struct ib_device *ib_dev); + +int ib_cache_setup_one(struct ib_device *device); +void ib_cache_cleanup_one(struct ib_device *device); +void ib_cache_release_one(struct ib_device *device); + #endif /* _CORE_PRIV_H */ diff --git a/kernel/drivers/infiniband/core/device.c b/kernel/drivers/infiniband/core/device.c index 18c1ece76..179e8134d 100644 --- a/kernel/drivers/infiniband/core/device.c +++ b/kernel/drivers/infiniband/core/device.c @@ -38,7 +38,10 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/mutex.h> +#include <linux/netdevice.h> #include <rdma/rdma_netlink.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> #include "core_priv.h" @@ -50,22 +53,34 @@ struct ib_client_data { struct list_head list; struct ib_client *client; void * data; + /* The device or client is going down. Do not call client or device + * callbacks other than remove(). */ + bool going_down; }; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); +/* The device_list and client_list contain devices and clients after their + * registration has completed, and the devices and clients are removed + * during unregistration. */ static LIST_HEAD(device_list); static LIST_HEAD(client_list); /* - * device_mutex protects access to both device_list and client_list. - * There's no real point to using multiple locks or something fancier - * like an rwsem: we always access both lists, and we're always - * modifying one list or the other list. In any case this is not a - * hot path so there's no point in trying to optimize. + * device_mutex and lists_rwsem protect access to both device_list and + * client_list. device_mutex protects writer access by device and client + * registration / de-registration. lists_rwsem protects reader access to + * these lists. Iterators of these lists must lock it for read, while updates + * to the lists must be done with a write lock. A special case is when the + * device_mutex is locked. In this case locking the lists for read access is + * not necessary as the device_mutex implies it. + * + * lists_rwsem also protects access to the client data list. */ static DEFINE_MUTEX(device_mutex); +static DECLARE_RWSEM(lists_rwsem); + static int ib_device_check_mandatory(struct ib_device *device) { @@ -92,7 +107,8 @@ static int ib_device_check_mandatory(struct ib_device *device) IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), - IB_MANDATORY_FUNC(dereg_mr) + IB_MANDATORY_FUNC(dereg_mr), + IB_MANDATORY_FUNC(get_port_immutable) }; int i; @@ -151,18 +167,36 @@ static int alloc_name(char *name) return 0; } -static int start_port(struct ib_device *device) +static void ib_device_release(struct device *device) { - return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; -} + struct ib_device *dev = container_of(device, struct ib_device, dev); + ib_cache_release_one(dev); + kfree(dev->port_immutable); + kfree(dev); +} -static int end_port(struct ib_device *device) +static int ib_device_uevent(struct device *device, + struct kobj_uevent_env *env) { - return (device->node_type == RDMA_NODE_IB_SWITCH) ? - 0 : device->phys_port_cnt; + struct ib_device *dev = container_of(device, struct ib_device, dev); + + if (add_uevent_var(env, "NAME=%s", dev->name)) + return -ENOMEM; + + /* + * It would be nice to pass the node GUID with the event... + */ + + return 0; } +static struct class ib_class = { + .name = "infiniband", + .dev_release = ib_device_release, + .dev_uevent = ib_device_uevent, +}; + /** * ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate @@ -175,9 +209,27 @@ static int end_port(struct ib_device *device) */ struct ib_device *ib_alloc_device(size_t size) { - BUG_ON(size < sizeof (struct ib_device)); + struct ib_device *device; + + if (WARN_ON(size < sizeof(struct ib_device))) + return NULL; + + device = kzalloc(size, GFP_KERNEL); + if (!device) + return NULL; + + device->dev.class = &ib_class; + device_initialize(&device->dev); + + dev_set_drvdata(&device->dev, device); + + INIT_LIST_HEAD(&device->event_handler_list); + spin_lock_init(&device->event_handler_lock); + spin_lock_init(&device->client_data_lock); + INIT_LIST_HEAD(&device->client_data_list); + INIT_LIST_HEAD(&device->port_list); - return kzalloc(size, GFP_KERNEL); + return device; } EXPORT_SYMBOL(ib_alloc_device); @@ -189,13 +241,8 @@ EXPORT_SYMBOL(ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { - if (device->reg_state == IB_DEV_UNINITIALIZED) { - kfree(device); - return; - } - - BUG_ON(device->reg_state != IB_DEV_UNREGISTERED); - + WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && + device->reg_state != IB_DEV_UNINITIALIZED); kobject_put(&device->dev.kobj); } EXPORT_SYMBOL(ib_dealloc_device); @@ -214,51 +261,53 @@ static int add_client_context(struct ib_device *device, struct ib_client *client context->client = client; context->data = NULL; + context->going_down = false; + down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_add(&context->list, &device->client_data_list); spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); return 0; } -static int read_port_table_lengths(struct ib_device *device) +static int verify_immutable(const struct ib_device *dev, u8 port) { - struct ib_port_attr *tprops = NULL; - int num_ports, ret = -ENOMEM; - u8 port_index; - - tprops = kmalloc(sizeof *tprops, GFP_KERNEL); - if (!tprops) - goto out; - - num_ports = end_port(device) - start_port(device) + 1; + return WARN_ON(!rdma_cap_ib_mad(dev, port) && + rdma_max_mad_size(dev, port) != 0); +} - device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports, - GFP_KERNEL); - device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports, - GFP_KERNEL); - if (!device->pkey_tbl_len || !device->gid_tbl_len) - goto err; +static int read_port_immutable(struct ib_device *device) +{ + int ret; + u8 start_port = rdma_start_port(device); + u8 end_port = rdma_end_port(device); + u8 port; + + /** + * device->port_immutable is indexed directly by the port number to make + * access to this data as efficient as possible. + * + * Therefore port_immutable is declared as a 1 based array with + * potential empty slots at the beginning. + */ + device->port_immutable = kzalloc(sizeof(*device->port_immutable) + * (end_port + 1), + GFP_KERNEL); + if (!device->port_immutable) + return -ENOMEM; - for (port_index = 0; port_index < num_ports; ++port_index) { - ret = ib_query_port(device, port_index + start_port(device), - tprops); + for (port = start_port; port <= end_port; ++port) { + ret = device->get_port_immutable(device, port, + &device->port_immutable[port]); if (ret) - goto err; - device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len; - device->gid_tbl_len[port_index] = tprops->gid_tbl_len; - } - - ret = 0; - goto out; + return ret; -err: - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); -out: - kfree(tprops); - return ret; + if (verify_immutable(device, port)) + return -EINVAL; + } + return 0; } /** @@ -275,6 +324,7 @@ int ib_register_device(struct ib_device *device, u8, struct kobject *)) { int ret; + struct ib_client *client; mutex_lock(&device_mutex); @@ -289,40 +339,37 @@ int ib_register_device(struct ib_device *device, goto out; } - INIT_LIST_HEAD(&device->event_handler_list); - INIT_LIST_HEAD(&device->client_data_list); - spin_lock_init(&device->event_handler_lock); - spin_lock_init(&device->client_data_lock); - - ret = read_port_table_lengths(device); + ret = read_port_immutable(device); if (ret) { - printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n", + printk(KERN_WARNING "Couldn't create per port immutable data %s\n", device->name); goto out; } + ret = ib_cache_setup_one(device); + if (ret) { + printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); + goto out; + } + ret = ib_device_register_sysfs(device, port_callback); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", device->name); - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); + ib_cache_cleanup_one(device); goto out; } - list_add_tail(&device->core_list, &device_list); - device->reg_state = IB_DEV_REGISTERED; - { - struct ib_client *client; - - list_for_each_entry(client, &client_list, list) - if (client->add && !add_client_context(device, client)) - client->add(device); - } + list_for_each_entry(client, &client_list, list) + if (client->add && !add_client_context(device, client)) + client->add(device); - out: + down_write(&lists_rwsem); + list_add_tail(&device->core_list, &device_list); + up_write(&lists_rwsem); +out: mutex_unlock(&device_mutex); return ret; } @@ -336,29 +383,37 @@ EXPORT_SYMBOL(ib_register_device); */ void ib_unregister_device(struct ib_device *device) { - struct ib_client *client; struct ib_client_data *context, *tmp; unsigned long flags; mutex_lock(&device_mutex); - list_for_each_entry_reverse(client, &client_list, list) - if (client->remove) - client->remove(device); - + down_write(&lists_rwsem); list_del(&device->core_list); + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + context->going_down = true; + spin_unlock_irqrestore(&device->client_data_lock, flags); + downgrade_write(&lists_rwsem); - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); + list_for_each_entry_safe(context, tmp, &device->client_data_list, + list) { + if (context->client->remove) + context->client->remove(device, context->data); + } + up_read(&lists_rwsem); mutex_unlock(&device_mutex); ib_device_unregister_sysfs(device); + ib_cache_cleanup_one(device); + down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) kfree(context); spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); device->reg_state = IB_DEV_UNREGISTERED; } @@ -383,11 +438,14 @@ int ib_register_client(struct ib_client *client) mutex_lock(&device_mutex); - list_add_tail(&client->list, &client_list); list_for_each_entry(device, &device_list, core_list) if (client->add && !add_client_context(device, client)) client->add(device); + down_write(&lists_rwsem); + list_add_tail(&client->list, &client_list); + up_write(&lists_rwsem); + mutex_unlock(&device_mutex); return 0; @@ -410,19 +468,41 @@ void ib_unregister_client(struct ib_client *client) mutex_lock(&device_mutex); + down_write(&lists_rwsem); + list_del(&client->list); + up_write(&lists_rwsem); + list_for_each_entry(device, &device_list, core_list) { - if (client->remove) - client->remove(device); + struct ib_client_data *found_context = NULL; + down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) if (context->client == client) { - list_del(&context->list); - kfree(context); + context->going_down = true; + found_context = context; + break; } spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); + + if (client->remove) + client->remove(device, found_context ? + found_context->data : NULL); + + if (!found_context) { + pr_warn("No client context found for %s/%s\n", + device->name, client->name); + continue; + } + + down_write(&lists_rwsem); + spin_lock_irqsave(&device->client_data_lock, flags); + list_del(&found_context->list); + kfree(found_context); + spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); } - list_del(&client->list); mutex_unlock(&device_mutex); } @@ -558,7 +638,11 @@ EXPORT_SYMBOL(ib_dispatch_event); int ib_query_device(struct ib_device *device, struct ib_device_attr *device_attr) { - return device->query_device(device, device_attr); + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; + + memset(device_attr, 0, sizeof(*device_attr)); + + return device->query_device(device, device_attr, &uhw); } EXPORT_SYMBOL(ib_query_device); @@ -575,7 +659,7 @@ int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr) { - if (port_num < start_port(device) || port_num > end_port(device)) + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; return device->query_port(device, port_num, port_attr); @@ -588,17 +672,92 @@ EXPORT_SYMBOL(ib_query_port); * @port_num:Port number to query * @index:GID table index to query * @gid:Returned GID + * @attr: Returned GID attributes related to this GID index (only in RoCE). + * NULL means ignore. * * ib_query_gid() fetches the specified GID table entry. */ int ib_query_gid(struct ib_device *device, - u8 port_num, int index, union ib_gid *gid) + u8 port_num, int index, union ib_gid *gid, + struct ib_gid_attr *attr) { + if (rdma_cap_roce_gid_table(device, port_num)) + return ib_get_cached_gid(device, port_num, index, gid, attr); + + if (attr) + return -EINVAL; + return device->query_gid(device, port_num, index, gid); } EXPORT_SYMBOL(ib_query_gid); /** + * ib_enum_roce_netdev - enumerate all RoCE ports + * @ib_dev : IB device we want to query + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all of the physical RoCE ports of ib_dev + * which are related to netdevice and calls callback() on each + * device for which filter() function returns non zero. + */ +void ib_enum_roce_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie) +{ + u8 port; + + for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); + port++) + if (rdma_protocol_roce(ib_dev, port)) { + struct net_device *idev = NULL; + + if (ib_dev->get_netdev) + idev = ib_dev->get_netdev(ib_dev, port); + + if (idev && + idev->reg_state >= NETREG_UNREGISTERED) { + dev_put(idev); + idev = NULL; + } + + if (filter(ib_dev, port, idev, filter_cookie)) + cb(ib_dev, port, idev, cookie); + + if (idev) + dev_put(idev); + } +} + +/** + * ib_enum_all_roce_netdevs - enumerate all RoCE devices + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all RoCE devices' physical ports which are related + * to netdevices and calls callback() on each device for which + * filter() function returns non zero. + */ +void ib_enum_all_roce_netdevs(roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie) +{ + struct ib_device *dev; + + down_read(&lists_rwsem); + list_for_each_entry(dev, &device_list, core_list) + ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); + up_read(&lists_rwsem); +} + +/** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query @@ -653,7 +812,7 @@ int ib_modify_port(struct ib_device *device, if (!device->modify_port) return -ENOSYS; - if (port_num < start_port(device) || port_num > end_port(device)) + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; return device->modify_port(device, port_num, port_modify_mask, @@ -666,19 +825,28 @@ EXPORT_SYMBOL(ib_modify_port); * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - u8 *port_num, u16 *index) + struct net_device *ndev, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; - for (port = start_port(device); port <= end_port(device); ++port) { - for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { - ret = ib_query_gid(device, port, i, &tmp_gid); + for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { + if (rdma_cap_roce_gid_table(device, port)) { + if (!ib_find_cached_gid_by_port(device, gid, port, + ndev, index)) { + *port_num = port; + return 0; + } + } + + for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { + ret = ib_query_gid(device, port, i, &tmp_gid, NULL); if (ret) return ret; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { @@ -709,7 +877,7 @@ int ib_find_pkey(struct ib_device *device, u16 tmp_pkey; int partial_ix = -1; - for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { + for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; @@ -733,6 +901,51 @@ int ib_find_pkey(struct ib_device *device, } EXPORT_SYMBOL(ib_find_pkey); +/** + * ib_get_net_dev_by_params() - Return the appropriate net_dev + * for a received CM request + * @dev: An RDMA device on which the request has been received. + * @port: Port number on the RDMA device. + * @pkey: The Pkey the request came on. + * @gid: A GID that the net_dev uses to communicate. + * @addr: Contains the IP address that the request specified as its + * destination. + */ +struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, + u8 port, + u16 pkey, + const union ib_gid *gid, + const struct sockaddr *addr) +{ + struct net_device *net_dev = NULL; + struct ib_client_data *context; + + if (!rdma_protocol_ib(dev, port)) + return NULL; + + down_read(&lists_rwsem); + + list_for_each_entry(context, &dev->client_data_list, list) { + struct ib_client *client = context->client; + + if (context->going_down) + continue; + + if (client->get_net_dev_by_params) { + net_dev = client->get_net_dev_by_params(dev, port, pkey, + gid, addr, + context->data); + if (net_dev) + break; + } + } + + up_read(&lists_rwsem); + + return net_dev; +} +EXPORT_SYMBOL(ib_get_net_dev_by_params); + static int __init ib_core_init(void) { int ret; @@ -741,7 +954,7 @@ static int __init ib_core_init(void) if (!ib_wq) return -ENOMEM; - ret = ib_sysfs_setup(); + ret = class_register(&ib_class); if (ret) { printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); goto err; @@ -753,19 +966,12 @@ static int __init ib_core_init(void) goto err_sysfs; } - ret = ib_cache_setup(); - if (ret) { - printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto err_nl; - } + ib_cache_setup(); return 0; -err_nl: - ibnl_cleanup(); - err_sysfs: - ib_sysfs_cleanup(); + class_unregister(&ib_class); err: destroy_workqueue(ib_wq); @@ -776,7 +982,7 @@ static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); ibnl_cleanup(); - ib_sysfs_cleanup(); + class_unregister(&ib_class); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); } diff --git a/kernel/drivers/infiniband/core/iwpm_msg.c b/kernel/drivers/infiniband/core/iwpm_msg.c index e6ffa2e66..22a3abee2 100644 --- a/kernel/drivers/infiniband/core/iwpm_msg.c +++ b/kernel/drivers/infiniband/core/iwpm_msg.c @@ -67,7 +67,8 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) err_str = "Invalid port mapper client"; goto pid_query_error; } - if (iwpm_registered_client(nl_client)) + if (iwpm_check_registration(nl_client, IWPM_REG_VALID) || + iwpm_user_pid == IWPM_PID_UNAVAILABLE) return 0; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client); if (!skb) { @@ -106,7 +107,6 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ - iwpm_set_registered(nl_client, 1); iwpm_user_pid = IWPM_PID_UNAVAILABLE; err_str = "Unable to send a nlmsg"; goto pid_query_error; @@ -144,12 +144,12 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) err_str = "Invalid port mapper client"; goto add_mapping_error; } - if (!iwpm_registered_client(nl_client)) { + if (!iwpm_valid_pid()) + return 0; + if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { err_str = "Unregistered port mapper client"; goto add_mapping_error; } - if (!iwpm_valid_pid()) - return 0; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; @@ -214,12 +214,12 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) err_str = "Invalid port mapper client"; goto query_mapping_error; } - if (!iwpm_registered_client(nl_client)) { + if (!iwpm_valid_pid()) + return 0; + if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { err_str = "Unregistered port mapper client"; goto query_mapping_error; } - if (!iwpm_valid_pid()) - return 0; ret = -ENOMEM; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client); if (!skb) { @@ -288,12 +288,12 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) err_str = "Invalid port mapper client"; goto remove_mapping_error; } - if (!iwpm_registered_client(nl_client)) { + if (!iwpm_valid_pid()) + return 0; + if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) { err_str = "Unregistered port mapper client"; goto remove_mapping_error; } - if (!iwpm_valid_pid()) - return 0; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client); if (!skb) { ret = -ENOMEM; @@ -388,7 +388,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", __func__, iwpm_user_pid); if (iwpm_valid_client(nl_client)) - iwpm_set_registered(nl_client, 1); + iwpm_set_registration(nl_client, IWPM_REG_VALID); register_pid_response_exit: nlmsg_request->request_done = 1; /* always for found nlmsg_request */ @@ -644,7 +644,6 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX]; const char *msg_type = "Mapping Info response"; - int iwpm_pid; u8 nl_client; char *iwpm_name; u16 iwpm_version; @@ -669,14 +668,14 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) __func__, nl_client); return ret; } - iwpm_set_registered(nl_client, 0); + iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + iwpm_user_pid = cb->nlh->nlmsg_pid; if (!iwpm_mapinfo_available()) return 0; - iwpm_pid = cb->nlh->nlmsg_pid; pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", - __func__, iwpm_pid); - ret = iwpm_send_mapinfo(nl_client, iwpm_pid); + __func__, iwpm_user_pid); + ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid); return ret; } EXPORT_SYMBOL(iwpm_mapping_info_cb); diff --git a/kernel/drivers/infiniband/core/iwpm_util.c b/kernel/drivers/infiniband/core/iwpm_util.c index a626795bf..5fb089e91 100644 --- a/kernel/drivers/infiniband/core/iwpm_util.c +++ b/kernel/drivers/infiniband/core/iwpm_util.c @@ -78,6 +78,7 @@ init_exit: mutex_unlock(&iwpm_admin_lock); if (!ret) { iwpm_set_valid(nl_client, 1); + iwpm_set_registration(nl_client, IWPM_REG_UNDEF); pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__); } @@ -106,6 +107,7 @@ int iwpm_exit(u8 nl_client) } mutex_unlock(&iwpm_admin_lock); iwpm_set_valid(nl_client, 0); + iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } EXPORT_SYMBOL(iwpm_exit); @@ -397,17 +399,23 @@ void iwpm_set_valid(u8 nl_client, int valid) } /* valid client */ -int iwpm_registered_client(u8 nl_client) +u32 iwpm_get_registration(u8 nl_client) { return iwpm_admin.reg_list[nl_client]; } /* valid client */ -void iwpm_set_registered(u8 nl_client, int reg) +void iwpm_set_registration(u8 nl_client, u32 reg) { iwpm_admin.reg_list[nl_client] = reg; } +/* valid client */ +u32 iwpm_check_registration(u8 nl_client, u32 reg) +{ + return (iwpm_get_registration(nl_client) & reg); +} + int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, struct sockaddr_storage *b_sockaddr) { diff --git a/kernel/drivers/infiniband/core/iwpm_util.h b/kernel/drivers/infiniband/core/iwpm_util.h index ee2d9ff09..b7b9e194c 100644 --- a/kernel/drivers/infiniband/core/iwpm_util.h +++ b/kernel/drivers/infiniband/core/iwpm_util.h @@ -58,6 +58,10 @@ #define IWPM_PID_UNDEFINED -1 #define IWPM_PID_UNAVAILABLE -2 +#define IWPM_REG_UNDEF 0x01 +#define IWPM_REG_VALID 0x02 +#define IWPM_REG_INCOMPL 0x04 + struct iwpm_nlmsg_request { struct list_head inprocess_list; __u32 nlmsg_seq; @@ -88,7 +92,7 @@ struct iwpm_admin_data { atomic_t refcount; atomic_t nlmsg_seq; int client_list[RDMA_NL_NUM_CLIENTS]; - int reg_list[RDMA_NL_NUM_CLIENTS]; + u32 reg_list[RDMA_NL_NUM_CLIENTS]; }; /** @@ -159,19 +163,31 @@ int iwpm_valid_client(u8 nl_client); void iwpm_set_valid(u8 nl_client, int valid); /** - * iwpm_registered_client - Check if the port mapper client is registered + * iwpm_check_registration - Check if the client registration + * matches the given one * @nl_client: The index of the netlink client + * @reg: The given registration type to compare with * * Call iwpm_register_pid() to register a client + * Returns true if the client registration matches reg, + * otherwise returns false + */ +u32 iwpm_check_registration(u8 nl_client, u32 reg); + +/** + * iwpm_set_registration - Set the client registration + * @nl_client: The index of the netlink client + * @reg: Registration type to set */ -int iwpm_registered_client(u8 nl_client); +void iwpm_set_registration(u8 nl_client, u32 reg); /** - * iwpm_set_registered - Set the port mapper client to registered or not + * iwpm_get_registration * @nl_client: The index of the netlink client - * @reg: 1 if registered or 0 if not + * + * Returns the client registration type */ -void iwpm_set_registered(u8 nl_client, int reg); +u32 iwpm_get_registration(u8 nl_client); /** * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of diff --git a/kernel/drivers/infiniband/core/mad.c b/kernel/drivers/infiniband/core/mad.c index 74c30f4c5..2281de122 100644 --- a/kernel/drivers/infiniband/core/mad.c +++ b/kernel/drivers/infiniband/core/mad.c @@ -3,6 +3,7 @@ * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -44,6 +45,7 @@ #include "mad_priv.h" #include "mad_rmpp.h" #include "smi.h" +#include "opa_smi.h" #include "agent.h" MODULE_LICENSE("Dual BSD/GPL"); @@ -59,8 +61,6 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); -static struct kmem_cache *ib_mad_cache; - static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; @@ -73,7 +73,7 @@ static int method_in_use(struct ib_mad_mgmt_method_table **method, static void remove_mad_reg_req(struct ib_mad_agent_private *priv); static struct ib_mad_agent_private *find_mad_agent( struct ib_mad_port_private *port_priv, - struct ib_mad *mad); + const struct ib_mad_hdr *mad); static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_private *mad); static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv); @@ -179,12 +179,12 @@ static int is_vendor_method_in_use( return 0; } -int ib_response_mad(struct ib_mad *mad) +int ib_response_mad(const struct ib_mad_hdr *hdr) { - return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) || - (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) || - ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) && - (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP))); + return ((hdr->method & IB_MGMT_METHOD_RESP) || + (hdr->method == IB_MGMT_METHOD_TRAP_REPRESS) || + ((hdr->mgmt_class == IB_MGMT_CLASS_BM) && + (hdr->attr_mod & IB_BM_ATTR_MOD_RESP))); } EXPORT_SYMBOL(ib_response_mad); @@ -338,13 +338,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error1; } - mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(mad_agent_priv->agent.mr)) { - ret = ERR_PTR(-ENOMEM); - goto error2; - } - if (mad_reg_req) { reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL); if (!reg_req) { @@ -429,8 +422,6 @@ error4: spin_unlock_irqrestore(&port_priv->reg_lock, flags); kfree(reg_req); error3: - ib_dereg_mr(mad_agent_priv->agent.mr); -error2: kfree(mad_agent_priv); error1: return ret; @@ -590,7 +581,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) wait_for_completion(&mad_agent_priv->comp); kfree(mad_agent_priv->reg_req); - ib_dereg_mr(mad_agent_priv->agent.mr); kfree(mad_agent_priv); } @@ -717,6 +707,32 @@ static void build_smp_wc(struct ib_qp *qp, wc->port_num = port_num; } +static size_t mad_priv_size(const struct ib_mad_private *mp) +{ + return sizeof(struct ib_mad_private) + mp->mad_size; +} + +static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags) +{ + size_t size = sizeof(struct ib_mad_private) + mad_size; + struct ib_mad_private *ret = kzalloc(size, flags); + + if (ret) + ret->mad_size = mad_size; + + return ret; +} + +static size_t port_mad_size(const struct ib_mad_port_private *port_priv) +{ + return rdma_max_mad_size(port_priv->device, port_priv->port_num); +} + +static size_t mad_priv_dma_size(const struct ib_mad_private *mp) +{ + return sizeof(struct ib_grh) + mp->mad_size; +} + /* * Return 0 if SMP is to be sent * Return 1 if SMP was consumed locally (whether or not solicited) @@ -727,6 +743,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, { int ret = 0; struct ib_smp *smp = mad_send_wr->send_buf.mad; + struct opa_smp *opa_smp = (struct opa_smp *)smp; unsigned long flags; struct ib_mad_local_private *local; struct ib_mad_private *mad_priv; @@ -735,11 +752,16 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, struct ib_device *device = mad_agent_priv->agent.device; u8 port_num; struct ib_wc mad_wc; - struct ib_send_wr *send_wr = &mad_send_wr->send_wr; - - if (device->node_type == RDMA_NODE_IB_SWITCH && + struct ib_ud_wr *send_wr = &mad_send_wr->send_wr; + size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv); + u16 out_mad_pkey_index = 0; + u16 drslid; + bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, + mad_agent_priv->qp_info->port_priv->port_num); + + if (rdma_cap_ib_switch(device) && smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) - port_num = send_wr->wr.ud.port_num; + port_num = send_wr->port_num; else port_num = mad_agent_priv->agent.port_num; @@ -749,19 +771,49 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, * If we are at the start of the LID routed part, don't update the * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec. */ - if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) == - IB_LID_PERMISSIVE && - smi_handle_dr_smp_send(smp, device->node_type, port_num) == - IB_SMI_DISCARD) { - ret = -EINVAL; - dev_err(&device->dev, "Invalid directed route\n"); - goto out; - } + if (opa && smp->class_version == OPA_SMP_CLASS_VERSION) { + u32 opa_drslid; + + if ((opa_get_smp_direction(opa_smp) + ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) == + OPA_LID_PERMISSIVE && + opa_smi_handle_dr_smp_send(opa_smp, + rdma_cap_ib_switch(device), + port_num) == IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "OPA Invalid directed route\n"); + goto out; + } + opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid); + if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) && + opa_drslid & 0xffff0000) { + ret = -EINVAL; + dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n", + opa_drslid); + goto out; + } + drslid = (u16)(opa_drslid & 0x0000ffff); - /* Check to post send on QP or process locally */ - if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD && - smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD) - goto out; + /* Check to post send on QP or process locally */ + if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD && + opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD) + goto out; + } else { + if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) == + IB_LID_PERMISSIVE && + smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) == + IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "Invalid directed route\n"); + goto out; + } + drslid = be16_to_cpu(smp->dr_slid); + + /* Check to post send on QP or process locally */ + if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD && + smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD) + goto out; + } local = kmalloc(sizeof *local, GFP_ATOMIC); if (!local) { @@ -771,7 +823,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, } local->mad_priv = NULL; local->recv_mad_agent = NULL; - mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC); + mad_priv = alloc_mad_private(mad_size, GFP_ATOMIC); if (!mad_priv) { ret = -ENOMEM; dev_err(&device->dev, "No memory for local response MAD\n"); @@ -780,18 +832,25 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, } build_smp_wc(mad_agent_priv->agent.qp, - send_wr->wr_id, be16_to_cpu(smp->dr_slid), - send_wr->wr.ud.pkey_index, - send_wr->wr.ud.port_num, &mad_wc); + send_wr->wr.wr_id, drslid, + send_wr->pkey_index, + send_wr->port_num, &mad_wc); + + if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) { + mad_wc.byte_len = mad_send_wr->send_buf.hdr_len + + mad_send_wr->send_buf.data_len + + sizeof(struct ib_grh); + } /* No GRH for DR SMP */ ret = device->process_mad(device, 0, port_num, &mad_wc, NULL, - (struct ib_mad *)smp, - (struct ib_mad *)&mad_priv->mad); + (const struct ib_mad_hdr *)smp, mad_size, + (struct ib_mad_hdr *)mad_priv->mad, + &mad_size, &out_mad_pkey_index); switch (ret) { case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: - if (ib_response_mad(&mad_priv->mad.mad) && + if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) && mad_agent_priv->agent.recv_handler) { local->mad_priv = mad_priv; local->recv_mad_agent = mad_agent_priv; @@ -801,39 +860,43 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, */ atomic_inc(&mad_agent_priv->refcount); } else - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); break; case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED: - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); break; case IB_MAD_RESULT_SUCCESS: /* Treat like an incoming receive MAD */ port_priv = ib_get_mad_port(mad_agent_priv->agent.device, mad_agent_priv->agent.port_num); if (port_priv) { - memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad)); + memcpy(mad_priv->mad, smp, mad_priv->mad_size); recv_mad_agent = find_mad_agent(port_priv, - &mad_priv->mad.mad); + (const struct ib_mad_hdr *)mad_priv->mad); } if (!port_priv || !recv_mad_agent) { /* * No receiving agent so drop packet and * generate send completion. */ - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); break; } local->mad_priv = mad_priv; local->recv_mad_agent = recv_mad_agent; break; default: - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); kfree(local); ret = -EINVAL; goto out; } local->mad_send_wr = mad_send_wr; + if (opa) { + local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index; + local->return_wc_byte_len = mad_size; + } /* Reference MAD agent until send side of local completion handled */ atomic_inc(&mad_agent_priv->refcount); /* Queue local completion to local list */ @@ -847,11 +910,11 @@ out: return ret; } -static int get_pad_size(int hdr_len, int data_len) +static int get_pad_size(int hdr_len, int data_len, size_t mad_size) { int seg_size, pad; - seg_size = sizeof(struct ib_mad) - hdr_len; + seg_size = mad_size - hdr_len; if (data_len && seg_size) { pad = seg_size - data_len % seg_size; return pad == seg_size ? 0 : pad; @@ -870,14 +933,15 @@ static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr) } static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, - gfp_t gfp_mask) + size_t mad_size, gfp_t gfp_mask) { struct ib_mad_send_buf *send_buf = &send_wr->send_buf; struct ib_rmpp_mad *rmpp_mad = send_buf->mad; struct ib_rmpp_segment *seg = NULL; int left, seg_size, pad; - send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len; + send_buf->seg_size = mad_size - send_buf->hdr_len; + send_buf->seg_rmpp_size = mad_size - IB_MGMT_RMPP_HDR; seg_size = send_buf->seg_size; pad = send_wr->pad; @@ -910,7 +974,7 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, return 0; } -int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent) +int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent) { return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP); } @@ -920,26 +984,37 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, int rmpp_active, int hdr_len, int data_len, - gfp_t gfp_mask) + gfp_t gfp_mask, + u8 base_version) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; int pad, message_size, ret, size; void *buf; + size_t mad_size; + bool opa; mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, agent); - pad = get_pad_size(hdr_len, data_len); + + opa = rdma_cap_opa_mad(mad_agent->device, mad_agent->port_num); + + if (opa && base_version == OPA_MGMT_BASE_VERSION) + mad_size = sizeof(struct opa_mad); + else + mad_size = sizeof(struct ib_mad); + + pad = get_pad_size(hdr_len, data_len, mad_size); message_size = hdr_len + data_len + pad; if (ib_mad_kernel_rmpp_agent(mad_agent)) { - if (!rmpp_active && message_size > sizeof(struct ib_mad)) + if (!rmpp_active && message_size > mad_size) return ERR_PTR(-EINVAL); } else - if (rmpp_active || message_size > sizeof(struct ib_mad)) + if (rmpp_active || message_size > mad_size) return ERR_PTR(-EINVAL); - size = rmpp_active ? hdr_len : sizeof(struct ib_mad); + size = rmpp_active ? hdr_len : mad_size; buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); if (!buf) return ERR_PTR(-ENOMEM); @@ -953,21 +1028,28 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, mad_send_wr->mad_agent_priv = mad_agent_priv; mad_send_wr->sg_list[0].length = hdr_len; - mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey; - mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len; - mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey; - - mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr; - mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list; - mad_send_wr->send_wr.num_sge = 2; - mad_send_wr->send_wr.opcode = IB_WR_SEND; - mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED; - mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn; - mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; - mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index; + mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey; + + /* OPA MADs don't have to be the full 2048 bytes */ + if (opa && base_version == OPA_MGMT_BASE_VERSION && + data_len < mad_size - hdr_len) + mad_send_wr->sg_list[1].length = data_len; + else + mad_send_wr->sg_list[1].length = mad_size - hdr_len; + + mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey; + + mad_send_wr->send_wr.wr.wr_id = (unsigned long) mad_send_wr; + mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list; + mad_send_wr->send_wr.wr.num_sge = 2; + mad_send_wr->send_wr.wr.opcode = IB_WR_SEND; + mad_send_wr->send_wr.wr.send_flags = IB_SEND_SIGNALED; + mad_send_wr->send_wr.remote_qpn = remote_qpn; + mad_send_wr->send_wr.remote_qkey = IB_QP_SET_QKEY; + mad_send_wr->send_wr.pkey_index = pkey_index; if (rmpp_active) { - ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask); + ret = alloc_send_rmpp_list(mad_send_wr, mad_size, gfp_mask); if (ret) { kfree(buf); return ERR_PTR(ret); @@ -1069,7 +1151,7 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) /* Set WR ID to find mad_send_wr upon completion */ qp_info = mad_send_wr->mad_agent_priv->qp_info; - mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list; + mad_send_wr->send_wr.wr.wr_id = (unsigned long)&mad_send_wr->mad_list; mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; mad_agent = mad_send_wr->send_buf.mad_agent; @@ -1097,7 +1179,7 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) spin_lock_irqsave(&qp_info->send_queue.lock, flags); if (qp_info->send_queue.count < qp_info->send_queue.max_active) { - ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr, + ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); list = &qp_info->send_queue.list; } else { @@ -1162,7 +1244,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, * request associated with the completion */ next_send_buf = send_buf->next; - mad_send_wr->send_wr.wr.ud.ah = send_buf->ah; + mad_send_wr->send_wr.ah = send_buf->ah; if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { @@ -1237,7 +1319,7 @@ void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc) recv_wc); priv = container_of(mad_priv_hdr, struct ib_mad_private, header); - kmem_cache_free(ib_mad_cache, priv); + kfree(priv); } } EXPORT_SYMBOL(ib_free_recv_mad); @@ -1324,7 +1406,7 @@ static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class) } static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class, - char *oui) + const char *oui) { int i; @@ -1622,13 +1704,13 @@ out: static struct ib_mad_agent_private * find_mad_agent(struct ib_mad_port_private *port_priv, - struct ib_mad *mad) + const struct ib_mad_hdr *mad_hdr) { struct ib_mad_agent_private *mad_agent = NULL; unsigned long flags; spin_lock_irqsave(&port_priv->reg_lock, flags); - if (ib_response_mad(mad)) { + if (ib_response_mad(mad_hdr)) { u32 hi_tid; struct ib_mad_agent_private *entry; @@ -1636,7 +1718,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv, * Routing is based on high 32 bits of transaction ID * of MAD. */ - hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32; + hi_tid = be64_to_cpu(mad_hdr->tid) >> 32; list_for_each_entry(entry, &port_priv->agent_list, agent_list) { if (entry->agent.hi_tid == hi_tid) { mad_agent = entry; @@ -1648,45 +1730,45 @@ find_mad_agent(struct ib_mad_port_private *port_priv, struct ib_mad_mgmt_method_table *method; struct ib_mad_mgmt_vendor_class_table *vendor; struct ib_mad_mgmt_vendor_class *vendor_class; - struct ib_vendor_mad *vendor_mad; + const struct ib_vendor_mad *vendor_mad; int index; /* * Routing is based on version, class, and method * For "newer" vendor MADs, also based on OUI */ - if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION) + if (mad_hdr->class_version >= MAX_MGMT_VERSION) goto out; - if (!is_vendor_class(mad->mad_hdr.mgmt_class)) { + if (!is_vendor_class(mad_hdr->mgmt_class)) { class = port_priv->version[ - mad->mad_hdr.class_version].class; + mad_hdr->class_version].class; if (!class) goto out; - if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >= + if (convert_mgmt_class(mad_hdr->mgmt_class) >= IB_MGMT_MAX_METHODS) goto out; method = class->method_table[convert_mgmt_class( - mad->mad_hdr.mgmt_class)]; + mad_hdr->mgmt_class)]; if (method) - mad_agent = method->agent[mad->mad_hdr.method & + mad_agent = method->agent[mad_hdr->method & ~IB_MGMT_METHOD_RESP]; } else { vendor = port_priv->version[ - mad->mad_hdr.class_version].vendor; + mad_hdr->class_version].vendor; if (!vendor) goto out; vendor_class = vendor->vendor_class[vendor_class_index( - mad->mad_hdr.mgmt_class)]; + mad_hdr->mgmt_class)]; if (!vendor_class) goto out; /* Find matching OUI */ - vendor_mad = (struct ib_vendor_mad *)mad; + vendor_mad = (const struct ib_vendor_mad *)mad_hdr; index = find_vendor_oui(vendor_class, vendor_mad->oui); if (index == -1) goto out; method = vendor_class->method_table[index]; if (method) { - mad_agent = method->agent[mad->mad_hdr.method & + mad_agent = method->agent[mad_hdr->method & ~IB_MGMT_METHOD_RESP]; } } @@ -1708,23 +1790,32 @@ out: return mad_agent; } -static int validate_mad(struct ib_mad *mad, u32 qp_num) +static int validate_mad(const struct ib_mad_hdr *mad_hdr, + const struct ib_mad_qp_info *qp_info, + bool opa) { int valid = 0; + u32 qp_num = qp_info->qp->qp_num; /* Make sure MAD base version is understood */ - if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) { - pr_err("MAD received with unsupported base version %d\n", - mad->mad_hdr.base_version); + if (mad_hdr->base_version != IB_MGMT_BASE_VERSION && + (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) { + pr_err("MAD received with unsupported base version %d %s\n", + mad_hdr->base_version, opa ? "(opa)" : ""); goto out; } /* Filter SMI packets sent to other than QP0 */ - if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || - (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { if (qp_num == 0) valid = 1; } else { + /* CM attributes other than ClassPortInfo only use Send method */ + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) && + (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) && + (mad_hdr->method != IB_MGMT_METHOD_SEND)) + goto out; /* Filter GSI packets sent to QP0 */ if (qp_num != 0) valid = 1; @@ -1734,8 +1825,8 @@ out: return valid; } -static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_hdr *mad_hdr) +static int is_rmpp_data_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_hdr *mad_hdr) { struct ib_rmpp_mad *rmpp_mad; @@ -1747,16 +1838,16 @@ static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv, (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA); } -static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr, - struct ib_mad_recv_wc *rwc) +static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc) { - return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class == + return ((struct ib_mad_hdr *)(wr->send_buf.mad))->mgmt_class == rwc->recv_buf.mad->mad_hdr.mgmt_class; } -static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_send_wr_private *wr, - struct ib_mad_recv_wc *rwc ) +static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc ) { struct ib_ah_attr attr; u8 send_resp, rcv_resp; @@ -1765,8 +1856,8 @@ static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv, u8 port_num = mad_agent_priv->agent.port_num; u8 lmc; - send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad); - rcv_resp = ib_response_mad(rwc->recv_buf.mad); + send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad); + rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr); if (send_resp == rcv_resp) /* both requests, or both responses. GIDs different */ @@ -1791,7 +1882,7 @@ static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv, ((1 << lmc) - 1))); } else { if (ib_get_cached_gid(device, port_num, - attr.grh.sgid_index, &sgid)) + attr.grh.sgid_index, &sgid, NULL)) return 0; return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, 16); @@ -1811,22 +1902,22 @@ static inline int is_direct(u8 class) } struct ib_mad_send_wr_private* -ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_recv_wc *wc) +ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_recv_wc *wc) { struct ib_mad_send_wr_private *wr; - struct ib_mad *mad; + const struct ib_mad_hdr *mad_hdr; - mad = (struct ib_mad *)wc->recv_buf.mad; + mad_hdr = &wc->recv_buf.mad->mad_hdr; list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) { - if ((wr->tid == mad->mad_hdr.tid) && + if ((wr->tid == mad_hdr->tid) && rcv_has_same_class(wr, wc) && /* * Don't check GID for direct routed MADs. * These might have permissive LIDs. */ - (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || + (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) return (wr->status == IB_WC_SUCCESS) ? wr : NULL; } @@ -1836,15 +1927,15 @@ ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, * been notified that the send has completed */ list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) { - if (is_data_mad(mad_agent_priv, wr->send_buf.mad) && - wr->tid == mad->mad_hdr.tid && + if (is_rmpp_data_mad(mad_agent_priv, wr->send_buf.mad) && + wr->tid == mad_hdr->tid && wr->timeout && rcv_has_same_class(wr, wc) && /* * Don't check GID for direct routed MADs. * These might have permissive LIDs. */ - (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || + (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) /* Verify request has not been canceled */ return (wr->status == IB_WC_SUCCESS) ? wr : NULL; @@ -1879,7 +1970,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, } /* Complete corresponding request */ - if (ib_response_mad(mad_recv_wc->recv_buf.mad)) { + if (ib_response_mad(&mad_recv_wc->recv_buf.mad->mad_hdr)) { spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); if (!mad_send_wr) { @@ -1924,26 +2015,163 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, } } -static bool generate_unmatched_resp(struct ib_mad_private *recv, - struct ib_mad_private *response) +static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv, + const struct ib_mad_qp_info *qp_info, + const struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response) { - if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET || - recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) { - memcpy(response, recv, sizeof *response); + enum smi_forward_action retsmi; + struct ib_smp *smp = (struct ib_smp *)recv->mad; + + if (smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, + port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + retsmi = smi_check_forward_dr_smp(smp); + if (retsmi == IB_SMI_LOCAL) + return IB_SMI_HANDLE; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if (smi_handle_dr_smp_send(smp, + rdma_cap_ib_switch(port_priv->device), + port_num) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + } else if (rdma_cap_ib_switch(port_priv->device)) { + /* forward case for switches */ + memcpy(response, recv, mad_priv_size(response)); response->header.recv_wc.wc = &response->header.wc; - response->header.recv_wc.recv_buf.mad = &response->mad.mad; + response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; response->header.recv_wc.recv_buf.grh = &response->grh; - response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; - response->mad.mad.mad_hdr.status = - cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); - if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) - response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION; + + agent_send_response((const struct ib_mad_hdr *)response->mad, + &response->grh, wc, + port_priv->device, + smi_get_fwd_port(smp), + qp_info->qp->qp_num, + response->mad_size, + false); + + return IB_SMI_DISCARD; + } + return IB_SMI_HANDLE; +} + +static bool generate_unmatched_resp(const struct ib_mad_private *recv, + struct ib_mad_private *response, + size_t *resp_len, bool opa) +{ + const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad; + struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad; + + if (recv_hdr->method == IB_MGMT_METHOD_GET || + recv_hdr->method == IB_MGMT_METHOD_SET) { + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + resp_hdr->method = IB_MGMT_METHOD_GET_RESP; + resp_hdr->status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); + if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + resp_hdr->status |= IB_SMP_DIRECTION; + + if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) { + if (recv_hdr->mgmt_class == + IB_MGMT_CLASS_SUBN_LID_ROUTED || + recv_hdr->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + *resp_len = opa_get_smp_header_size( + (struct opa_smp *)recv->mad); + else + *resp_len = sizeof(struct ib_mad_hdr); + } return true; } else { return false; } } + +static enum smi_action +handle_opa_smi(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info, + struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response) +{ + enum smi_forward_action retsmi; + struct opa_smp *smp = (struct opa_smp *)recv->mad; + + if (opa_smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, + port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + retsmi = opa_smi_check_forward_dr_smp(smp); + if (retsmi == IB_SMI_LOCAL) + return IB_SMI_HANDLE; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if (opa_smi_handle_dr_smp_send(smp, + rdma_cap_ib_switch(port_priv->device), + port_num) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + if (opa_smi_check_local_smp(smp, port_priv->device) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + } else if (rdma_cap_ib_switch(port_priv->device)) { + /* forward case for switches */ + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.opa_mad = + (struct opa_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + + agent_send_response((const struct ib_mad_hdr *)response->mad, + &response->grh, wc, + port_priv->device, + opa_smi_get_fwd_port(smp), + qp_info->qp->qp_num, + recv->header.wc.byte_len, + true); + + return IB_SMI_DISCARD; + } + + return IB_SMI_HANDLE; +} + +static enum smi_action +handle_smi(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info, + struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response, + bool opa) +{ + struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad; + + if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION && + mad_hdr->class_version == OPA_SMI_CLASS_VERSION) + return handle_opa_smi(port_priv, qp_info, wc, port_num, recv, + response); + + return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response); +} + static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, struct ib_wc *wc) { @@ -1954,109 +2182,97 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, struct ib_mad_agent_private *mad_agent; int port_num; int ret = IB_MAD_RESULT_SUCCESS; + size_t mad_size; + u16 resp_mad_pkey_index = 0; + bool opa; mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; qp_info = mad_list->mad_queue->qp_info; dequeue_mad(mad_list); + opa = rdma_cap_opa_mad(qp_info->port_priv->device, + qp_info->port_priv->port_num); + mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, mad_list); recv = container_of(mad_priv_hdr, struct ib_mad_private, header); ib_dma_unmap_single(port_priv->device, recv->header.mapping, - sizeof(struct ib_mad_private) - - sizeof(struct ib_mad_private_header), + mad_priv_dma_size(recv), DMA_FROM_DEVICE); /* Setup MAD receive work completion from "normal" work completion */ recv->header.wc = *wc; recv->header.recv_wc.wc = &recv->header.wc; - recv->header.recv_wc.mad_len = sizeof(struct ib_mad); - recv->header.recv_wc.recv_buf.mad = &recv->mad.mad; + + if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) { + recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh); + recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); + } else { + recv->header.recv_wc.mad_len = sizeof(struct ib_mad); + recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); + } + + recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad; recv->header.recv_wc.recv_buf.grh = &recv->grh; if (atomic_read(&qp_info->snoop_count)) snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS); /* Validate MAD */ - if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num)) + if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) goto out; - response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); + mad_size = recv->mad_size; + response = alloc_mad_private(mad_size, GFP_KERNEL); if (!response) { dev_err(&port_priv->device->dev, "ib_mad_recv_done_handler no memory for response buffer\n"); goto out; } - if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) + if (rdma_cap_ib_switch(port_priv->device)) port_num = wc->port_num; else port_num = port_priv->port_num; - if (recv->mad.mad.mad_hdr.mgmt_class == + if (((struct ib_mad_hdr *)recv->mad)->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - enum smi_forward_action retsmi; - - if (smi_handle_dr_smp_recv(&recv->mad.smp, - port_priv->device->node_type, - port_num, - port_priv->device->phys_port_cnt) == - IB_SMI_DISCARD) + if (handle_smi(port_priv, qp_info, wc, port_num, recv, + response, opa) + == IB_SMI_DISCARD) goto out; - - retsmi = smi_check_forward_dr_smp(&recv->mad.smp); - if (retsmi == IB_SMI_LOCAL) - goto local; - - if (retsmi == IB_SMI_SEND) { /* don't forward */ - if (smi_handle_dr_smp_send(&recv->mad.smp, - port_priv->device->node_type, - port_num) == IB_SMI_DISCARD) - goto out; - - if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD) - goto out; - } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) { - /* forward case for switches */ - memcpy(response, recv, sizeof(*response)); - response->header.recv_wc.wc = &response->header.wc; - response->header.recv_wc.recv_buf.mad = &response->mad.mad; - response->header.recv_wc.recv_buf.grh = &response->grh; - - agent_send_response(&response->mad.mad, - &response->grh, wc, - port_priv->device, - smi_get_fwd_port(&recv->mad.smp), - qp_info->qp->qp_num); - - goto out; - } } -local: /* Give driver "right of first refusal" on incoming MAD */ if (port_priv->device->process_mad) { ret = port_priv->device->process_mad(port_priv->device, 0, port_priv->port_num, wc, &recv->grh, - &recv->mad.mad, - &response->mad.mad); + (const struct ib_mad_hdr *)recv->mad, + recv->mad_size, + (struct ib_mad_hdr *)response->mad, + &mad_size, &resp_mad_pkey_index); + + if (opa) + wc->pkey_index = resp_mad_pkey_index; + if (ret & IB_MAD_RESULT_SUCCESS) { if (ret & IB_MAD_RESULT_CONSUMED) goto out; if (ret & IB_MAD_RESULT_REPLY) { - agent_send_response(&response->mad.mad, + agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc, port_priv->device, port_num, - qp_info->qp->qp_num); + qp_info->qp->qp_num, + mad_size, opa); goto out; } } } - mad_agent = find_mad_agent(port_priv, &recv->mad.mad); + mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad); if (mad_agent) { ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); /* @@ -2065,17 +2281,17 @@ local: */ recv = NULL; } else if ((ret & IB_MAD_RESULT_SUCCESS) && - generate_unmatched_resp(recv, response)) { - agent_send_response(&response->mad.mad, &recv->grh, wc, - port_priv->device, port_num, qp_info->qp->qp_num); + generate_unmatched_resp(recv, response, &mad_size, opa)) { + agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc, + port_priv->device, port_num, + qp_info->qp->qp_num, mad_size, opa); } out: /* Post another receive request for this QP */ if (response) { ib_mad_post_receive_mads(qp_info, response); - if (recv) - kmem_cache_free(ib_mad_cache, recv); + kfree(recv); } else ib_mad_post_receive_mads(qp_info, recv); } @@ -2246,7 +2462,7 @@ retry: ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); if (queued_send_wr) { - ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr, + ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr, &bad_send_wr); if (ret) { dev_err(&port_priv->device->dev, @@ -2304,7 +2520,7 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, struct ib_send_wr *bad_send_wr; mad_send_wr->retry = 0; - ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr, + ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); if (ret) ib_mad_send_done_handler(port_priv, wc); @@ -2411,7 +2627,8 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv, list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list, agent_list) { - if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) && + if (is_rmpp_data_mad(mad_agent_priv, + mad_send_wr->send_buf.mad) && &mad_send_wr->send_buf == send_buf) return mad_send_wr; } @@ -2468,10 +2685,14 @@ static void local_completions(struct work_struct *work) int free_mad; struct ib_wc wc; struct ib_mad_send_wc mad_send_wc; + bool opa; mad_agent_priv = container_of(work, struct ib_mad_agent_private, local_work); + opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, + mad_agent_priv->qp_info->port_priv->port_num); + spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->local_list)) { local = list_entry(mad_agent_priv->local_list.next, @@ -2481,6 +2702,7 @@ static void local_completions(struct work_struct *work) spin_unlock_irqrestore(&mad_agent_priv->lock, flags); free_mad = 0; if (local->mad_priv) { + u8 base_version; recv_mad_agent = local->recv_mad_agent; if (!recv_mad_agent) { dev_err(&mad_agent_priv->agent.device->dev, @@ -2496,17 +2718,26 @@ static void local_completions(struct work_struct *work) build_smp_wc(recv_mad_agent->agent.qp, (unsigned long) local->mad_send_wr, be16_to_cpu(IB_LID_PERMISSIVE), - 0, recv_mad_agent->agent.port_num, &wc); + local->mad_send_wr->send_wr.pkey_index, + recv_mad_agent->agent.port_num, &wc); local->mad_priv->header.recv_wc.wc = &wc; - local->mad_priv->header.recv_wc.mad_len = - sizeof(struct ib_mad); + + base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version; + if (opa && base_version == OPA_MGMT_BASE_VERSION) { + local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len; + local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); + } else { + local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad); + local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); + } + INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list); list_add(&local->mad_priv->header.recv_wc.recv_buf.list, &local->mad_priv->header.recv_wc.rmpp_list); local->mad_priv->header.recv_wc.recv_buf.grh = NULL; local->mad_priv->header.recv_wc.recv_buf.mad = - &local->mad_priv->mad.mad; + (struct ib_mad *)local->mad_priv->mad; if (atomic_read(&recv_mad_agent->qp_info->snoop_count)) snoop_recv(recv_mad_agent->qp_info, &local->mad_priv->header.recv_wc, @@ -2534,7 +2765,7 @@ local_send_completion: spin_lock_irqsave(&mad_agent_priv->lock, flags); atomic_dec(&mad_agent_priv->refcount); if (free_mad) - kmem_cache_free(ib_mad_cache, local->mad_priv); + kfree(local->mad_priv); kfree(local); } spin_unlock_irqrestore(&mad_agent_priv->lock, flags); @@ -2649,8 +2880,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_queue *recv_queue = &qp_info->recv_queue; /* Initialize common scatter list fields */ - sg_list.length = sizeof *mad_priv - sizeof mad_priv->header; - sg_list.lkey = (*qp_info->port_priv->mr).lkey; + sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey; /* Initialize common receive WR fields */ recv_wr.next = NULL; @@ -2663,7 +2893,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, mad_priv = mad; mad = NULL; } else { - mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); + mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv), + GFP_ATOMIC); if (!mad_priv) { dev_err(&qp_info->port_priv->device->dev, "No memory for receive buffer\n"); @@ -2671,10 +2902,10 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, break; } } + sg_list.length = mad_priv_dma_size(mad_priv); sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, &mad_priv->grh, - sizeof *mad_priv - - sizeof mad_priv->header, + mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { @@ -2698,10 +2929,9 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, spin_unlock_irqrestore(&recv_queue->lock, flags); ib_dma_unmap_single(qp_info->port_priv->device, mad_priv->header.mapping, - sizeof *mad_priv - - sizeof mad_priv->header, + mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); dev_err(&qp_info->port_priv->device->dev, "ib_post_recv failed: %d\n", ret); break; @@ -2738,10 +2968,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) ib_dma_unmap_single(qp_info->port_priv->device, recv->header.mapping, - sizeof(struct ib_mad_private) - - sizeof(struct ib_mad_private_header), + mad_priv_dma_size(recv), DMA_FROM_DEVICE); - kmem_cache_free(ib_mad_cache, recv); + kfree(recv); } qp_info->recv_queue.count = 0; @@ -2922,6 +3151,14 @@ static int ib_mad_port_open(struct ib_device *device, unsigned long flags; char name[sizeof "ib_mad123"]; int has_smi; + struct ib_cq_init_attr cq_attr = {}; + + if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) + return -EFAULT; + + if (WARN_ON(rdma_cap_opa_mad(device, port_num) && + rdma_max_mad_size(device, port_num) < OPA_MGMT_MAD_SIZE)) + return -EFAULT; /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); @@ -2938,13 +3175,14 @@ static int ib_mad_port_open(struct ib_device *device, init_mad_qp(port_priv, &port_priv->qp_info[1]); cq_size = mad_sendq_size + mad_recvq_size; - has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND; + has_smi = rdma_cap_ib_smi(device, port_num); if (has_smi) cq_size *= 2; + cq_attr.cqe = cq_size; port_priv->cq = ib_create_cq(port_priv->device, ib_mad_thread_completion_handler, - NULL, port_priv, cq_size, 0); + NULL, port_priv, &cq_attr); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); @@ -2958,13 +3196,6 @@ static int ib_mad_port_open(struct ib_device *device, goto error4; } - port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(port_priv->mr)) { - dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n"); - ret = PTR_ERR(port_priv->mr); - goto error5; - } - if (has_smi) { ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); if (ret) @@ -3005,8 +3236,6 @@ error8: error7: destroy_mad_qp(&port_priv->qp_info[0]); error6: - ib_dereg_mr(port_priv->mr); -error5: ib_dealloc_pd(port_priv->pd); error4: ib_destroy_cq(port_priv->cq); @@ -3041,7 +3270,6 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) destroy_workqueue(port_priv->wq); destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); - ib_dereg_mr(port_priv->mr); ib_dealloc_pd(port_priv->pd); ib_destroy_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); @@ -3055,20 +3283,14 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) static void ib_mad_init_device(struct ib_device *device) { - int start, end, i; + int start, i; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; + start = rdma_start_port(device); - if (device->node_type == RDMA_NODE_IB_SWITCH) { - start = 0; - end = 0; - } else { - start = 1; - end = device->phys_port_cnt; - } + for (i = start; i <= rdma_end_port(device); i++) { + if (!rdma_cap_ib_mad(device, i)) + continue; - for (i = start; i <= end; i++) { if (ib_mad_port_open(device, i)) { dev_err(&device->dev, "Couldn't open port %d\n", i); goto error; @@ -3086,40 +3308,31 @@ error_agent: dev_err(&device->dev, "Couldn't close port %d\n", i); error: - i--; + while (--i >= start) { + if (!rdma_cap_ib_mad(device, i)) + continue; - while (i >= start) { if (ib_agent_port_close(device, i)) dev_err(&device->dev, "Couldn't close port %d for agents\n", i); if (ib_mad_port_close(device, i)) dev_err(&device->dev, "Couldn't close port %d\n", i); - i--; } } -static void ib_mad_remove_device(struct ib_device *device) +static void ib_mad_remove_device(struct ib_device *device, void *client_data) { - int i, num_ports, cur_port; + int i; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + if (!rdma_cap_ib_mad(device, i)) + continue; - if (device->node_type == RDMA_NODE_IB_SWITCH) { - num_ports = 1; - cur_port = 0; - } else { - num_ports = device->phys_port_cnt; - cur_port = 1; - } - for (i = 0; i < num_ports; i++, cur_port++) { - if (ib_agent_port_close(device, cur_port)) + if (ib_agent_port_close(device, i)) dev_err(&device->dev, - "Couldn't close port %d for agents\n", - cur_port); - if (ib_mad_port_close(device, cur_port)) - dev_err(&device->dev, "Couldn't close port %d\n", - cur_port); + "Couldn't close port %d for agents\n", i); + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %d\n", i); } } @@ -3131,45 +3344,25 @@ static struct ib_client mad_client = { static int __init ib_mad_init_module(void) { - int ret; - mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); - ib_mad_cache = kmem_cache_create("ib_mad", - sizeof(struct ib_mad_private), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!ib_mad_cache) { - pr_err("Couldn't create ib_mad cache\n"); - ret = -ENOMEM; - goto error1; - } - INIT_LIST_HEAD(&ib_mad_port_list); if (ib_register_client(&mad_client)) { pr_err("Couldn't register ib_mad client\n"); - ret = -EINVAL; - goto error2; + return -EINVAL; } return 0; - -error2: - kmem_cache_destroy(ib_mad_cache); -error1: - return ret; } static void __exit ib_mad_cleanup_module(void) { ib_unregister_client(&mad_client); - kmem_cache_destroy(ib_mad_cache); } module_init(ib_mad_init_module); diff --git a/kernel/drivers/infiniband/core/mad_priv.h b/kernel/drivers/infiniband/core/mad_priv.h index d1a0b0ee9..990698a6a 100644 --- a/kernel/drivers/infiniband/core/mad_priv.h +++ b/kernel/drivers/infiniband/core/mad_priv.h @@ -41,6 +41,7 @@ #include <linux/workqueue.h> #include <rdma/ib_mad.h> #include <rdma/ib_smi.h> +#include <rdma/opa_smi.h> #define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */ @@ -56,7 +57,7 @@ /* Registration table sizes */ #define MAX_MGMT_CLASS 80 -#define MAX_MGMT_VERSION 8 +#define MAX_MGMT_VERSION 0x83 #define MAX_MGMT_OUI 8 #define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \ IB_MGMT_CLASS_VENDOR_RANGE2_START + 1) @@ -75,12 +76,9 @@ struct ib_mad_private_header { struct ib_mad_private { struct ib_mad_private_header header; + size_t mad_size; struct ib_grh grh; - union { - struct ib_mad mad; - struct ib_rmpp_mad rmpp_mad; - struct ib_smp smp; - } mad; + u8 mad[0]; } __attribute__ ((packed)); struct ib_rmpp_segment { @@ -125,7 +123,7 @@ struct ib_mad_send_wr_private { struct ib_mad_send_buf send_buf; u64 header_mapping; u64 payload_mapping; - struct ib_send_wr send_wr; + struct ib_ud_wr send_wr; struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG]; __be64 tid; unsigned long timeout; @@ -150,6 +148,7 @@ struct ib_mad_local_private { struct ib_mad_private *mad_priv; struct ib_mad_agent_private *recv_mad_agent; struct ib_mad_send_wr_private *mad_send_wr; + size_t return_wc_byte_len; }; struct ib_mad_mgmt_method_table { @@ -200,7 +199,6 @@ struct ib_mad_port_private { int port_num; struct ib_cq *cq; struct ib_pd *pd; - struct ib_mr *mr; spinlock_t reg_lock; struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; @@ -213,8 +211,8 @@ struct ib_mad_port_private { int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr); struct ib_mad_send_wr_private * -ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_recv_wc *mad_recv_wc); +ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_recv_wc *mad_recv_wc); void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_send_wc *mad_send_wc); diff --git a/kernel/drivers/infiniband/core/mad_rmpp.c b/kernel/drivers/infiniband/core/mad_rmpp.c index f37878c9c..382941b46 100644 --- a/kernel/drivers/infiniband/core/mad_rmpp.c +++ b/kernel/drivers/infiniband/core/mad_rmpp.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2005 Intel Inc. All rights reserved. * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -67,6 +68,7 @@ struct mad_rmpp_recv { u8 mgmt_class; u8 class_version; u8 method; + u8 base_version; }; static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) @@ -139,7 +141,8 @@ static void ack_recv(struct mad_rmpp_recv *rmpp_recv, hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp, recv_wc->wc->pkey_index, 1, hdr_len, - 0, GFP_KERNEL); + 0, GFP_KERNEL, + IB_MGMT_BASE_VERSION); if (IS_ERR(msg)) return; @@ -165,7 +168,8 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, recv_wc->wc->pkey_index, 1, - hdr_len, 0, GFP_KERNEL); + hdr_len, 0, GFP_KERNEL, + IB_MGMT_BASE_VERSION); if (IS_ERR(msg)) ib_destroy_ah(ah); else { @@ -316,6 +320,7 @@ create_rmpp_recv(struct ib_mad_agent_private *agent, rmpp_recv->mgmt_class = mad_hdr->mgmt_class; rmpp_recv->class_version = mad_hdr->class_version; rmpp_recv->method = mad_hdr->method; + rmpp_recv->base_version = mad_hdr->base_version; return rmpp_recv; error: kfree(rmpp_recv); @@ -431,14 +436,23 @@ static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv) { struct ib_rmpp_mad *rmpp_mad; int hdr_size, data_size, pad; + bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device, + rmpp_recv->agent->qp_info->port_priv->port_num); rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad; hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); - data_size = sizeof(struct ib_rmpp_mad) - hdr_size; - pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); - if (pad > IB_MGMT_RMPP_DATA || pad < 0) - pad = 0; + if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) { + data_size = sizeof(struct opa_rmpp_mad) - hdr_size; + pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > OPA_MGMT_RMPP_DATA || pad < 0) + pad = 0; + } else { + data_size = sizeof(struct ib_rmpp_mad) - hdr_size; + pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > IB_MGMT_RMPP_DATA || pad < 0) + pad = 0; + } return hdr_size + rmpp_recv->seg_num * data_size - pad; } @@ -570,13 +584,14 @@ static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr) if (mad_send_wr->seg_num == 1) { rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST; - paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA - - mad_send_wr->pad; + paylen = (mad_send_wr->send_buf.seg_count * + mad_send_wr->send_buf.seg_rmpp_size) - + mad_send_wr->pad; } if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) { rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST; - paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad; + paylen = mad_send_wr->send_buf.seg_rmpp_size - mad_send_wr->pad; } rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); diff --git a/kernel/drivers/infiniband/core/multicast.c b/kernel/drivers/infiniband/core/multicast.c index fa17b552f..bb6685fb0 100644 --- a/kernel/drivers/infiniband/core/multicast.c +++ b/kernel/drivers/infiniband/core/multicast.c @@ -43,7 +43,7 @@ #include "sa.h" static void mcast_add_one(struct ib_device *device); -static void mcast_remove_one(struct ib_device *device); +static void mcast_remove_one(struct ib_device *device, void *client_data); static struct ib_client mcast_client = { .name = "ib_multicast", @@ -729,7 +729,8 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, u16 gid_index; u8 p; - ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index); + ret = ib_find_cached_gid(device, &rec->port_gid, + NULL, &p, &gid_index); if (ret) return ret; @@ -780,8 +781,7 @@ static void mcast_event_handler(struct ib_event_handler *handler, int index; dev = container_of(handler, struct mcast_device, event_handler); - if (rdma_port_get_link_layer(dev->device, event->element.port_num) != - IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_mcast(dev->device, event->element.port_num)) return; index = event->element.port_num - dev->start_port; @@ -808,24 +808,16 @@ static void mcast_add_one(struct ib_device *device) int i; int count = 0; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; - dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, GFP_KERNEL); if (!dev) return; - if (device->node_type == RDMA_NODE_IB_SWITCH) - dev->start_port = dev->end_port = 0; - else { - dev->start_port = 1; - dev->end_port = device->phys_port_cnt; - } + dev->start_port = rdma_start_port(device); + dev->end_port = rdma_end_port(device); for (i = 0; i <= dev->end_port - dev->start_port; i++) { - if (rdma_port_get_link_layer(device, dev->start_port + i) != - IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_mcast(device, dev->start_port + i)) continue; port = &dev->port[i]; port->dev = dev; @@ -849,13 +841,12 @@ static void mcast_add_one(struct ib_device *device) ib_register_event_handler(&dev->event_handler); } -static void mcast_remove_one(struct ib_device *device) +static void mcast_remove_one(struct ib_device *device, void *client_data) { - struct mcast_device *dev; + struct mcast_device *dev = client_data; struct mcast_port *port; int i; - dev = ib_get_client_data(device, &mcast_client); if (!dev) return; @@ -863,8 +854,7 @@ static void mcast_remove_one(struct ib_device *device) flush_workqueue(mcast_wq); for (i = 0; i <= dev->end_port - dev->start_port; i++) { - if (rdma_port_get_link_layer(device, dev->start_port + i) == - IB_LINK_LAYER_INFINIBAND) { + if (rdma_cap_ib_mcast(device, dev->start_port + i)) { port = &dev->port[i]; deref_port(port); wait_for_completion(&port->comp); diff --git a/kernel/drivers/infiniband/core/netlink.c b/kernel/drivers/infiniband/core/netlink.c index 23dd5a5c7..d47df9356 100644 --- a/kernel/drivers/infiniband/core/netlink.c +++ b/kernel/drivers/infiniband/core/netlink.c @@ -49,6 +49,14 @@ static DEFINE_MUTEX(ibnl_mutex); static struct sock *nls; static LIST_HEAD(client_list); +int ibnl_chk_listeners(unsigned int group) +{ + if (netlink_has_listeners(nls, group) == 0) + return -1; + return 0; +} +EXPORT_SYMBOL(ibnl_chk_listeners); + int ibnl_add_client(int index, int nops, const struct ibnl_client_cbs cb_table[]) { @@ -151,6 +159,23 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) !client->cb_table[op].dump) return -EINVAL; + /* + * For response or local service set_timeout request, + * there is no need to use netlink_dump_start. + */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || + (index == RDMA_NL_LS && + op == RDMA_NL_LS_OP_SET_TIMEOUT)) { + struct netlink_callback cb = { + .skb = skb, + .nlh = nlh, + .dump = client->cb_table[op].dump, + .module = client->cb_table[op].module, + }; + + return cb.dump(skb, &cb); + } + { struct netlink_dump_control c = { .dump = client->cb_table[op].dump, @@ -165,9 +190,39 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; } +static void ibnl_rcv_reply_skb(struct sk_buff *skb) +{ + struct nlmsghdr *nlh; + int msglen; + + /* + * Process responses until there is no more message or the first + * request. Generally speaking, it is not recommended to mix responses + * with requests. + */ + while (skb->len >= nlmsg_total_size(0)) { + nlh = nlmsg_hdr(skb); + + if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) + return; + + /* Handle response only */ + if (nlh->nlmsg_flags & NLM_F_REQUEST) + return; + + ibnl_rcv_msg(skb, nlh); + + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + skb_pull(skb, msglen); + } +} + static void ibnl_rcv(struct sk_buff *skb) { mutex_lock(&ibnl_mutex); + ibnl_rcv_reply_skb(skb); netlink_rcv_skb(skb, &ibnl_rcv_msg); mutex_unlock(&ibnl_mutex); } diff --git a/kernel/drivers/infiniband/core/opa_smi.h b/kernel/drivers/infiniband/core/opa_smi.h new file mode 100644 index 000000000..3bfab3505 --- /dev/null +++ b/kernel/drivers/infiniband/core/opa_smi.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __OPA_SMI_H_ +#define __OPA_SMI_H_ + +#include <rdma/ib_smi.h> +#include <rdma/opa_smi.h> + +#include "smi.h" + +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, + int port_num, int phys_port_cnt); +int opa_smi_get_fwd_port(struct opa_smp *smp); +extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp); +extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, + bool is_switch, int port_num); + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action opa_smi_check_local_smp(struct opa_smp *smp, + struct ib_device *device) +{ + /* C14-9:3 -- We're at the end of the DR segment of path */ + /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */ + return (device->process_mad && + !opa_get_smp_direction(smp) && + (smp->hop_ptr == smp->hop_cnt + 1)) ? + IB_SMI_HANDLE : IB_SMI_DISCARD; +} + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action opa_smi_check_local_returning_smp(struct opa_smp *smp, + struct ib_device *device) +{ + /* C14-13:3 -- We're at the end of the DR segment of path */ + /* C14-13:4 -- Hop Pointer == 0 -> give to SM */ + return (device->process_mad && + opa_get_smp_direction(smp) && + !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD; +} + +#endif /* __OPA_SMI_H_ */ diff --git a/kernel/drivers/infiniband/core/roce_gid_mgmt.c b/kernel/drivers/infiniband/core/roce_gid_mgmt.c new file mode 100644 index 000000000..178f98482 --- /dev/null +++ b/kernel/drivers/infiniband/core/roce_gid_mgmt.c @@ -0,0 +1,747 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "core_priv.h" + +#include <linux/in.h> +#include <linux/in6.h> + +/* For in6_dev_get/in6_dev_put */ +#include <net/addrconf.h> +#include <net/bonding.h> + +#include <rdma/ib_cache.h> +#include <rdma/ib_addr.h> + +enum gid_op_type { + GID_DEL = 0, + GID_ADD +}; + +struct update_gid_event_work { + struct work_struct work; + union ib_gid gid; + struct ib_gid_attr gid_attr; + enum gid_op_type gid_op; +}; + +#define ROCE_NETDEV_CALLBACK_SZ 3 +struct netdev_event_work_cmd { + roce_netdev_callback cb; + roce_netdev_filter filter; + struct net_device *ndev; + struct net_device *filter_ndev; +}; + +struct netdev_event_work { + struct work_struct work; + struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; +}; + +static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, + u8 port, union ib_gid *gid, + struct ib_gid_attr *gid_attr) +{ + switch (gid_op) { + case GID_ADD: + ib_cache_gid_add(ib_dev, port, gid, gid_attr); + break; + case GID_DEL: + ib_cache_gid_del(ib_dev, port, gid, gid_attr); + break; + } +} + +enum bonding_slave_state { + BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, + BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, + /* No primary slave or the device isn't a slave in bonding */ + BONDING_SLAVE_STATE_NA = 1UL << 2, +}; + +static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, + struct net_device *upper) +{ + if (upper && netif_is_bond_master(upper)) { + struct net_device *pdev = + bond_option_active_slave_get_rcu(netdev_priv(upper)); + + if (pdev) + return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : + BONDING_SLAVE_STATE_INACTIVE; + } + + return BONDING_SLAVE_STATE_NA; +} + +static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper) +{ + struct net_device *_upper = NULL; + struct list_head *iter; + + netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) + if (_upper == upper) + break; + + return _upper == upper; +} + +#define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ + BONDING_SLAVE_STATE_NA) +static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *event_ndev = (struct net_device *)cookie; + struct net_device *real_dev; + int res; + + if (!rdma_ndev) + return 0; + + rcu_read_lock(); + real_dev = rdma_vlan_dev_real_dev(event_ndev); + if (!real_dev) + real_dev = event_ndev; + + res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) && + (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & + REQUIRED_BOND_STATES)) || + real_dev == rdma_ndev); + + rcu_read_unlock(); + return res; +} + +static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *master_dev; + int res; + + if (!rdma_ndev) + return 0; + + rcu_read_lock(); + master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); + res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == + BONDING_SLAVE_STATE_INACTIVE; + rcu_read_unlock(); + + return res; +} + +static int pass_all_filter(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + return 1; +} + +static int upper_device_filter(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *event_ndev = (struct net_device *)cookie; + int res; + + if (!rdma_ndev) + return 0; + + if (rdma_ndev == event_ndev) + return 1; + + rcu_read_lock(); + res = is_upper_dev_rcu(rdma_ndev, event_ndev); + rcu_read_unlock(); + + return res; +} + +static void update_gid_ip(enum gid_op_type gid_op, + struct ib_device *ib_dev, + u8 port, struct net_device *ndev, + struct sockaddr *addr) +{ + union ib_gid gid; + struct ib_gid_attr gid_attr; + + rdma_ip2gid(addr, &gid); + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + + update_gid(gid_op, ib_dev, port, &gid, &gid_attr); +} + +static void enum_netdev_default_gids(struct ib_device *ib_dev, + u8 port, struct net_device *event_ndev, + struct net_device *rdma_ndev) +{ + rcu_read_lock(); + if (!rdma_ndev || + ((rdma_ndev != event_ndev && + !is_upper_dev_rcu(rdma_ndev, event_ndev)) || + is_eth_active_slave_of_bonding_rcu(rdma_ndev, + netdev_master_upper_dev_get_rcu(rdma_ndev)) == + BONDING_SLAVE_STATE_INACTIVE)) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + IB_CACHE_GID_DEFAULT_MODE_SET); +} + +static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, + u8 port, + struct net_device *event_ndev, + struct net_device *rdma_ndev) +{ + struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); + + if (!rdma_ndev) + return; + + if (!real_dev) + real_dev = event_ndev; + + rcu_read_lock(); + + if (is_upper_dev_rcu(rdma_ndev, event_ndev) && + is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == + BONDING_SLAVE_STATE_INACTIVE) { + rcu_read_unlock(); + + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + IB_CACHE_GID_DEFAULT_MODE_DELETE); + } else { + rcu_read_unlock(); + } +} + +static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, + u8 port, struct net_device *ndev) +{ + struct in_device *in_dev; + struct sin_list { + struct list_head list; + struct sockaddr_in ip; + }; + struct sin_list *sin_iter; + struct sin_list *sin_temp; + + LIST_HEAD(sin_list); + if (ndev->reg_state >= NETREG_UNREGISTERING) + return; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(ndev); + if (!in_dev) { + rcu_read_unlock(); + return; + } + + for_ifa(in_dev) { + struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + + if (!entry) { + pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv4 update\n"); + continue; + } + entry->ip.sin_family = AF_INET; + entry->ip.sin_addr.s_addr = ifa->ifa_address; + list_add_tail(&entry->list, &sin_list); + } + endfor_ifa(in_dev); + rcu_read_unlock(); + + list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { + update_gid_ip(GID_ADD, ib_dev, port, ndev, + (struct sockaddr *)&sin_iter->ip); + list_del(&sin_iter->list); + kfree(sin_iter); + } +} + +static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, + u8 port, struct net_device *ndev) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *in6_dev; + struct sin6_list { + struct list_head list; + struct sockaddr_in6 sin6; + }; + struct sin6_list *sin6_iter; + struct sin6_list *sin6_temp; + struct ib_gid_attr gid_attr = {.ndev = ndev}; + LIST_HEAD(sin6_list); + + if (ndev->reg_state >= NETREG_UNREGISTERING) + return; + + in6_dev = in6_dev_get(ndev); + if (!in6_dev) + return; + + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + + if (!entry) { + pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv6 update\n"); + continue; + } + + entry->sin6.sin6_family = AF_INET6; + entry->sin6.sin6_addr = ifp->addr; + list_add_tail(&entry->list, &sin6_list); + } + read_unlock_bh(&in6_dev->lock); + + in6_dev_put(in6_dev); + + list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { + union ib_gid gid; + + rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); + update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); + list_del(&sin6_iter->list); + kfree(sin6_iter); + } +} + +static void _add_netdev_ips(struct ib_device *ib_dev, u8 port, + struct net_device *ndev) +{ + enum_netdev_ipv4_ips(ib_dev, port, ndev); + if (IS_ENABLED(CONFIG_IPV6)) + enum_netdev_ipv6_ips(ib_dev, port, ndev); +} + +static void add_netdev_ips(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *event_ndev = (struct net_device *)cookie; + + enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev); + _add_netdev_ips(ib_dev, port, event_ndev); +} + +static void del_netdev_ips(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *event_ndev = (struct net_device *)cookie; + + ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); +} + +static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, + u8 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct net *net; + struct net_device *ndev; + + /* Lock the rtnl to make sure the netdevs does not move under + * our feet + */ + rtnl_lock(); + for_each_net(net) + for_each_netdev(net, ndev) + if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev)) + add_netdev_ips(ib_dev, port, rdma_ndev, ndev); + rtnl_unlock(); +} + +/* This function will rescan all of the network devices in the system + * and add their gids, as needed, to the relevant RoCE devices. */ +int roce_rescan_device(struct ib_device *ib_dev) +{ + ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, + enum_all_gids_of_dev_cb, NULL); + + return 0; +} + +static void callback_for_addr_gid_device_scan(struct ib_device *device, + u8 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct update_gid_event_work *parsed = cookie; + + return update_gid(parsed->gid_op, device, + port, &parsed->gid, + &parsed->gid_attr); +} + +static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, + void *cookie, + void (*handle_netdev)(struct ib_device *ib_dev, + u8 port, + struct net_device *ndev)) +{ + struct net_device *ndev = (struct net_device *)cookie; + struct upper_list { + struct list_head list; + struct net_device *upper; + }; + struct net_device *upper; + struct list_head *iter; + struct upper_list *upper_iter; + struct upper_list *upper_temp; + LIST_HEAD(upper_list); + + rcu_read_lock(); + netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) { + struct upper_list *entry = kmalloc(sizeof(*entry), + GFP_ATOMIC); + + if (!entry) { + pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n"); + continue; + } + + list_add_tail(&entry->list, &upper_list); + dev_hold(upper); + entry->upper = upper; + } + rcu_read_unlock(); + + handle_netdev(ib_dev, port, ndev); + list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, + list) { + handle_netdev(ib_dev, port, upper_iter->upper); + dev_put(upper_iter->upper); + list_del(&upper_iter->list); + kfree(upper_iter); + } +} + +static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *event_ndev) +{ + ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); +} + +static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); +} + +static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); +} + +static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct net_device *master_ndev; + + rcu_read_lock(); + master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); + if (master_ndev) + dev_hold(master_ndev); + rcu_read_unlock(); + + if (master_ndev) { + bond_delete_netdev_default_gids(ib_dev, port, master_ndev, + rdma_ndev); + dev_put(master_ndev); + } +} + +static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *event_ndev = (struct net_device *)cookie; + + bond_delete_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev); +} + +/* The following functions operate on all IB devices. netdevice_event and + * addr_event execute ib_enum_all_roce_netdevs through a work. + * ib_enum_all_roce_netdevs iterates through all IB devices. + */ + +static void netdevice_event_work_handler(struct work_struct *_work) +{ + struct netdev_event_work *work = + container_of(_work, struct netdev_event_work, work); + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { + ib_enum_all_roce_netdevs(work->cmds[i].filter, + work->cmds[i].filter_ndev, + work->cmds[i].cb, + work->cmds[i].ndev); + dev_put(work->cmds[i].ndev); + dev_put(work->cmds[i].filter_ndev); + } + + kfree(work); +} + +static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, + struct net_device *ndev) +{ + unsigned int i; + struct netdev_event_work *ndev_work = + kmalloc(sizeof(*ndev_work), GFP_KERNEL); + + if (!ndev_work) { + pr_warn("roce_gid_mgmt: can't allocate work for netdevice_event\n"); + return NOTIFY_DONE; + } + + memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); + for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { + if (!ndev_work->cmds[i].ndev) + ndev_work->cmds[i].ndev = ndev; + if (!ndev_work->cmds[i].filter_ndev) + ndev_work->cmds[i].filter_ndev = ndev; + dev_hold(ndev_work->cmds[i].ndev); + dev_hold(ndev_work->cmds[i].filter_ndev); + } + INIT_WORK(&ndev_work->work, netdevice_event_work_handler); + + queue_work(ib_wq, &ndev_work->work); + + return NOTIFY_DONE; +} + +static const struct netdev_event_work_cmd add_cmd = { + .cb = add_netdev_ips, .filter = is_eth_port_of_netdev}; +static const struct netdev_event_work_cmd add_cmd_upper_ips = { + .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev}; + +static void netdevice_event_changeupper(struct netdev_notifier_changeupper_info *changeupper_info, + struct netdev_event_work_cmd *cmds) +{ + static const struct netdev_event_work_cmd upper_ips_del_cmd = { + .cb = del_netdev_upper_ips, .filter = upper_device_filter}; + static const struct netdev_event_work_cmd bonding_default_del_cmd = { + .cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave}; + + if (changeupper_info->linking == false) { + cmds[0] = upper_ips_del_cmd; + cmds[0].ndev = changeupper_info->upper_dev; + cmds[1] = add_cmd; + } else { + cmds[0] = bonding_default_del_cmd; + cmds[0].ndev = changeupper_info->upper_dev; + cmds[1] = add_cmd_upper_ips; + cmds[1].ndev = changeupper_info->upper_dev; + cmds[1].filter_ndev = changeupper_info->upper_dev; + } +} + +static int netdevice_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + static const struct netdev_event_work_cmd del_cmd = { + .cb = del_netdev_ips, .filter = pass_all_filter}; + static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { + .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave}; + static const struct netdev_event_work_cmd default_del_cmd = { + .cb = del_netdev_default_ips, .filter = pass_all_filter}; + static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { + .cb = del_netdev_upper_ips, .filter = upper_device_filter}; + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; + + if (ndev->type != ARPHRD_ETHER) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_UP: + cmds[0] = bonding_default_del_cmd_join; + cmds[1] = add_cmd; + break; + + case NETDEV_UNREGISTER: + if (ndev->reg_state < NETREG_UNREGISTERED) + cmds[0] = del_cmd; + else + return NOTIFY_DONE; + break; + + case NETDEV_CHANGEADDR: + cmds[0] = default_del_cmd; + cmds[1] = add_cmd; + break; + + case NETDEV_CHANGEUPPER: + netdevice_event_changeupper( + container_of(ptr, struct netdev_notifier_changeupper_info, info), + cmds); + break; + + case NETDEV_BONDING_FAILOVER: + cmds[0] = bonding_event_ips_del_cmd; + cmds[1] = bonding_default_del_cmd_join; + cmds[2] = add_cmd_upper_ips; + break; + + default: + return NOTIFY_DONE; + } + + return netdevice_queue_work(cmds, ndev); +} + +static void update_gid_event_work_handler(struct work_struct *_work) +{ + struct update_gid_event_work *work = + container_of(_work, struct update_gid_event_work, work); + + ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev, + callback_for_addr_gid_device_scan, work); + + dev_put(work->gid_attr.ndev); + kfree(work); +} + +static int addr_event(struct notifier_block *this, unsigned long event, + struct sockaddr *sa, struct net_device *ndev) +{ + struct update_gid_event_work *work; + enum gid_op_type gid_op; + + if (ndev->type != ARPHRD_ETHER) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + gid_op = GID_ADD; + break; + + case NETDEV_DOWN: + gid_op = GID_DEL; + break; + + default: + return NOTIFY_DONE; + } + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n"); + return NOTIFY_DONE; + } + + INIT_WORK(&work->work, update_gid_event_work_handler); + + rdma_ip2gid(sa, &work->gid); + work->gid_op = gid_op; + + memset(&work->gid_attr, 0, sizeof(work->gid_attr)); + dev_hold(ndev); + work->gid_attr.ndev = ndev; + + queue_work(ib_wq, &work->work); + + return NOTIFY_DONE; +} + +static int inetaddr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct sockaddr_in in; + struct net_device *ndev; + struct in_ifaddr *ifa = ptr; + + in.sin_family = AF_INET; + in.sin_addr.s_addr = ifa->ifa_address; + ndev = ifa->ifa_dev->dev; + + return addr_event(this, event, (struct sockaddr *)&in, ndev); +} + +static int inet6addr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct sockaddr_in6 in6; + struct net_device *ndev; + struct inet6_ifaddr *ifa6 = ptr; + + in6.sin6_family = AF_INET6; + in6.sin6_addr = ifa6->addr; + ndev = ifa6->idev->dev; + + return addr_event(this, event, (struct sockaddr *)&in6, ndev); +} + +static struct notifier_block nb_netdevice = { + .notifier_call = netdevice_event +}; + +static struct notifier_block nb_inetaddr = { + .notifier_call = inetaddr_event +}; + +static struct notifier_block nb_inet6addr = { + .notifier_call = inet6addr_event +}; + +int __init roce_gid_mgmt_init(void) +{ + register_inetaddr_notifier(&nb_inetaddr); + if (IS_ENABLED(CONFIG_IPV6)) + register_inet6addr_notifier(&nb_inet6addr); + /* We relay on the netdevice notifier to enumerate all + * existing devices in the system. Register to this notifier + * last to make sure we will not miss any IP add/del + * callbacks. + */ + register_netdevice_notifier(&nb_netdevice); + + return 0; +} + +void __exit roce_gid_mgmt_cleanup(void) +{ + if (IS_ENABLED(CONFIG_IPV6)) + unregister_inet6addr_notifier(&nb_inet6addr); + unregister_inetaddr_notifier(&nb_inetaddr); + unregister_netdevice_notifier(&nb_netdevice); + /* Ensure all gid deletion tasks complete before we go down, + * to avoid any reference to free'd memory. By the time + * ib-core is removed, all physical devices have been removed, + * so no issue with remaining hardware contexts. + */ +} diff --git a/kernel/drivers/infiniband/core/sa_query.c b/kernel/drivers/infiniband/core/sa_query.c index c38f030f0..a95a32ba5 100644 --- a/kernel/drivers/infiniband/core/sa_query.c +++ b/kernel/drivers/infiniband/core/sa_query.c @@ -45,12 +45,21 @@ #include <uapi/linux/if_ether.h> #include <rdma/ib_pack.h> #include <rdma/ib_cache.h> +#include <rdma/rdma_netlink.h> +#include <net/netlink.h> +#include <uapi/rdma/ib_user_sa.h> +#include <rdma/ib_marshall.h> #include "sa.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand subnet administration query support"); MODULE_LICENSE("Dual BSD/GPL"); +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 +#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 +static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT; + struct ib_sa_sm_ah { struct ib_ah *ah; struct kref ref; @@ -80,8 +89,16 @@ struct ib_sa_query { struct ib_mad_send_buf *mad_buf; struct ib_sa_sm_ah *sm_ah; int id; + u32 flags; + struct list_head list; /* Local svc request list */ + u32 seq; /* Local svc request sequence number */ + unsigned long timeout; /* Local svc timeout */ + u8 path_use; /* How will the pathrecord be used */ }; +#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 +#define IB_SA_CANCEL 0x00000002 + struct ib_sa_service_query { void (*callback)(int, struct ib_sa_service_rec *, void *); void *context; @@ -106,8 +123,28 @@ struct ib_sa_mcmember_query { struct ib_sa_query sa_query; }; +static LIST_HEAD(ib_nl_request_list); +static DEFINE_SPINLOCK(ib_nl_request_lock); +static atomic_t ib_nl_sa_request_seq; +static struct workqueue_struct *ib_nl_wq; +static struct delayed_work ib_nl_timed_work; +static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = { + [LS_NLA_TYPE_PATH_RECORD] = {.type = NLA_BINARY, + .len = sizeof(struct ib_path_rec_data)}, + [LS_NLA_TYPE_TIMEOUT] = {.type = NLA_U32}, + [LS_NLA_TYPE_SERVICE_ID] = {.type = NLA_U64}, + [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid)}, + [LS_NLA_TYPE_SGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid)}, + [LS_NLA_TYPE_TCLASS] = {.type = NLA_U8}, + [LS_NLA_TYPE_PKEY] = {.type = NLA_U16}, + [LS_NLA_TYPE_QOS_CLASS] = {.type = NLA_U16}, +}; + + static void ib_sa_add_one(struct ib_device *device); -static void ib_sa_remove_one(struct ib_device *device); +static void ib_sa_remove_one(struct ib_device *device, void *client_data); static struct ib_client sa_client = { .name = "sa", @@ -381,6 +418,429 @@ static const struct ib_field guidinfo_rec_table[] = { .size_bits = 512 }, }; +static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) +{ + query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE; +} + +static inline int ib_sa_query_cancelled(struct ib_sa_query *query) +{ + return (query->flags & IB_SA_CANCEL); +} + +static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, + struct ib_sa_query *query) +{ + struct ib_sa_path_rec *sa_rec = query->mad_buf->context[1]; + struct ib_sa_mad *mad = query->mad_buf->mad; + ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask; + u16 val16; + u64 val64; + struct rdma_ls_resolve_header *header; + + query->mad_buf->context[1] = NULL; + + /* Construct the family header first */ + header = (struct rdma_ls_resolve_header *) + skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + memcpy(header->device_name, query->port->agent->device->name, + LS_DEVICE_NAME_MAX); + header->port_num = query->port->port_num; + + if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && + sa_rec->reversible != 0) + query->path_use = LS_RESOLVE_PATH_USE_GMP; + else + query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL; + header->path_use = query->path_use; + + /* Now build the attributes */ + if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) { + val64 = be64_to_cpu(sa_rec->service_id); + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID, + sizeof(val64), &val64); + } + if (comp_mask & IB_SA_PATH_REC_DGID) + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID, + sizeof(sa_rec->dgid), &sa_rec->dgid); + if (comp_mask & IB_SA_PATH_REC_SGID) + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID, + sizeof(sa_rec->sgid), &sa_rec->sgid); + if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS, + sizeof(sa_rec->traffic_class), &sa_rec->traffic_class); + + if (comp_mask & IB_SA_PATH_REC_PKEY) { + val16 = be16_to_cpu(sa_rec->pkey); + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY, + sizeof(val16), &val16); + } + if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) { + val16 = be16_to_cpu(sa_rec->qos_class); + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS, + sizeof(val16), &val16); + } +} + +static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask) +{ + int len = 0; + + if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) + len += nla_total_size(sizeof(u64)); + if (comp_mask & IB_SA_PATH_REC_DGID) + len += nla_total_size(sizeof(struct rdma_nla_ls_gid)); + if (comp_mask & IB_SA_PATH_REC_SGID) + len += nla_total_size(sizeof(struct rdma_nla_ls_gid)); + if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) + len += nla_total_size(sizeof(u8)); + if (comp_mask & IB_SA_PATH_REC_PKEY) + len += nla_total_size(sizeof(u16)); + if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) + len += nla_total_size(sizeof(u16)); + + /* + * Make sure that at least some of the required comp_mask bits are + * set. + */ + if (WARN_ON(len == 0)) + return len; + + /* Add the family header */ + len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header)); + + return len; +} + +static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + void *data; + int ret = 0; + struct ib_sa_mad *mad; + int len; + + mad = query->mad_buf->mad; + len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask); + if (len <= 0) + return -EMSGSIZE; + + skb = nlmsg_new(len, gfp_mask); + if (!skb) + return -ENOMEM; + + /* Put nlmsg header only for now */ + data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS, + RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST); + if (!data) { + kfree_skb(skb); + return -EMSGSIZE; + } + + /* Add attributes */ + ib_nl_set_path_rec_attrs(skb, query); + + /* Repair the nlmsg header length */ + nlmsg_end(skb, nlh); + + ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, gfp_mask); + if (!ret) + ret = len; + else + ret = 0; + + return ret; +} + +static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) +{ + unsigned long flags; + unsigned long delay; + int ret; + + INIT_LIST_HEAD(&query->list); + query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); + + /* Put the request on the list first.*/ + spin_lock_irqsave(&ib_nl_request_lock, flags); + delay = msecs_to_jiffies(sa_local_svc_timeout_ms); + query->timeout = delay + jiffies; + list_add_tail(&query->list, &ib_nl_request_list); + /* Start the timeout if this is the only request */ + if (ib_nl_request_list.next == &query->list) + queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + + ret = ib_nl_send_msg(query, gfp_mask); + if (ret <= 0) { + ret = -EIO; + /* Remove the request */ + spin_lock_irqsave(&ib_nl_request_lock, flags); + list_del(&query->list); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + } else { + ret = 0; + } + + return ret; +} + +static int ib_nl_cancel_request(struct ib_sa_query *query) +{ + unsigned long flags; + struct ib_sa_query *wait_query; + int found = 0; + + spin_lock_irqsave(&ib_nl_request_lock, flags); + list_for_each_entry(wait_query, &ib_nl_request_list, list) { + /* Let the timeout to take care of the callback */ + if (query == wait_query) { + query->flags |= IB_SA_CANCEL; + query->timeout = jiffies; + list_move(&query->list, &ib_nl_request_list); + found = 1; + mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1); + break; + } + } + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + + return found; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc); + +static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query, + const struct nlmsghdr *nlh) +{ + struct ib_mad_send_wc mad_send_wc; + struct ib_sa_mad *mad = NULL; + const struct nlattr *head, *curr; + struct ib_path_rec_data *rec; + int len, rem; + u32 mask = 0; + int status = -EIO; + + if (query->callback) { + head = (const struct nlattr *) nlmsg_data(nlh); + len = nlmsg_len(nlh); + switch (query->path_use) { + case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: + mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; + break; + + case LS_RESOLVE_PATH_USE_ALL: + case LS_RESOLVE_PATH_USE_GMP: + default: + mask = IB_PATH_PRIMARY | IB_PATH_GMP | + IB_PATH_BIDIRECTIONAL; + break; + } + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) { + rec = nla_data(curr); + /* + * Get the first one. In the future, we may + * need to get up to 6 pathrecords. + */ + if ((rec->flags & mask) == mask) { + mad = query->mad_buf->mad; + mad->mad_hdr.method |= + IB_MGMT_METHOD_RESP; + memcpy(mad->data, rec->path_rec, + sizeof(rec->path_rec)); + status = 0; + break; + } + } + } + query->callback(query, status, mad); + } + + mad_send_wc.send_buf = query->mad_buf; + mad_send_wc.status = IB_WC_SUCCESS; + send_handler(query->mad_buf->mad_agent, &mad_send_wc); +} + +static void ib_nl_request_timeout(struct work_struct *work) +{ + unsigned long flags; + struct ib_sa_query *query; + unsigned long delay; + struct ib_mad_send_wc mad_send_wc; + int ret; + + spin_lock_irqsave(&ib_nl_request_lock, flags); + while (!list_empty(&ib_nl_request_list)) { + query = list_entry(ib_nl_request_list.next, + struct ib_sa_query, list); + + if (time_after(query->timeout, jiffies)) { + delay = query->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); + break; + } + + list_del(&query->list); + ib_sa_disable_local_svc(query); + /* Hold the lock to protect against query cancellation */ + if (ib_sa_query_cancelled(query)) + ret = -1; + else + ret = ib_post_send_mad(query->mad_buf, NULL); + if (ret) { + mad_send_wc.send_buf = query->mad_buf; + mad_send_wc.status = IB_WC_WR_FLUSH_ERR; + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + send_handler(query->port->agent, &mad_send_wc); + spin_lock_irqsave(&ib_nl_request_lock, flags); + } + } + spin_unlock_irqrestore(&ib_nl_request_lock, flags); +} + +static int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; + int timeout, delta, abs_delta; + const struct nlattr *attr; + unsigned long flags; + struct ib_sa_query *query; + long delay = 0; + struct nlattr *tb[LS_NLA_TYPE_MAX]; + int ret; + + if (!netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_policy); + attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT]; + if (ret || !attr) + goto settimeout_out; + + timeout = *(int *) nla_data(attr); + if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN) + timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN; + if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX) + timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX; + + delta = timeout - sa_local_svc_timeout_ms; + if (delta < 0) + abs_delta = -delta; + else + abs_delta = delta; + + if (delta != 0) { + spin_lock_irqsave(&ib_nl_request_lock, flags); + sa_local_svc_timeout_ms = timeout; + list_for_each_entry(query, &ib_nl_request_list, list) { + if (delta < 0 && abs_delta > query->timeout) + query->timeout = 0; + else + query->timeout += delta; + + /* Get the new delay from the first entry */ + if (!delay) { + delay = query->timeout - jiffies; + if (delay <= 0) + delay = 1; + } + } + if (delay) + mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, + (unsigned long)delay); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + } + +settimeout_out: + return skb->len; +} + +static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) +{ + struct nlattr *tb[LS_NLA_TYPE_MAX]; + int ret; + + if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) + return 0; + + ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_policy); + if (ret) + return 0; + + return 1; +} + +static int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; + unsigned long flags; + struct ib_sa_query *query; + struct ib_mad_send_buf *send_buf; + struct ib_mad_send_wc mad_send_wc; + int found = 0; + int ret; + + if (!netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + spin_lock_irqsave(&ib_nl_request_lock, flags); + list_for_each_entry(query, &ib_nl_request_list, list) { + /* + * If the query is cancelled, let the timeout routine + * take care of it. + */ + if (nlh->nlmsg_seq == query->seq) { + found = !ib_sa_query_cancelled(query); + if (found) + list_del(&query->list); + break; + } + } + + if (!found) { + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + goto resp_out; + } + + send_buf = query->mad_buf; + + if (!ib_nl_is_good_resolve_resp(nlh)) { + /* if the result is a failure, send out the packet via IB */ + ib_sa_disable_local_svc(query); + ret = ib_post_send_mad(query->mad_buf, NULL); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + if (ret) { + mad_send_wc.send_buf = send_buf; + mad_send_wc.status = IB_WC_GENERAL_ERR; + send_handler(query->port->agent, &mad_send_wc); + } + } else { + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + ib_nl_process_good_resolve_rsp(query, nlh); + } + +resp_out: + return skb->len; +} + +static struct ibnl_client_cbs ib_sa_cb_table[] = { + [RDMA_NL_LS_OP_RESOLVE] = { + .dump = ib_nl_handle_resolve_resp, + .module = THIS_MODULE }, + [RDMA_NL_LS_OP_SET_TIMEOUT] = { + .dump = ib_nl_handle_set_timeout, + .module = THIS_MODULE }, +}; + static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); @@ -450,7 +910,7 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event struct ib_sa_port *port = &sa_dev->port[event->element.port_num - sa_dev->start_port]; - if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_sa(handler->device, port->port_num)) return; spin_lock_irqsave(&port->ah_lock, flags); @@ -502,7 +962,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query) mad_buf = query->mad_buf; spin_unlock_irqrestore(&idr_lock, flags); - ib_cancel_mad(agent, mad_buf); + /* + * If the query is still on the netlink request list, schedule + * it to be cancelled by the timeout routine. Otherwise, it has been + * sent to the MAD layer and has to be cancelled from there. + */ + if (!ib_nl_cancel_request(query)) + ib_cancel_mad(agent, mad_buf); } EXPORT_SYMBOL(ib_sa_cancel_query); @@ -540,29 +1006,32 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; - force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET; + force_grh = rdma_cap_eth_ah(device, port_num); if (rec->hop_limit > 1 || force_grh) { + struct net_device *ndev = ib_get_ndev_from_path(rec); + ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; - ret = ib_find_cached_gid(device, &rec->sgid, &port_num, + ret = ib_find_cached_gid(device, &rec->sgid, ndev, &port_num, &gid_index); - if (ret) + if (ret) { + if (ndev) + dev_put(ndev); return ret; + } ah_attr->grh.sgid_index = gid_index; ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; + if (ndev) + dev_put(ndev); } if (force_grh) { memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); - ah_attr->vlan_id = rec->vlan_id; - } else { - ah_attr->vlan_id = 0xffff; } - return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); @@ -583,7 +1052,8 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) query->mad_buf = ib_create_send_mad(query->port->agent, 1, query->sm_ah->pkey_index, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, - gfp_mask); + gfp_mask, + IB_MGMT_BASE_VERSION); if (IS_ERR(query->mad_buf)) { kref_put(&query->sm_ah->ref, free_sm_ah); return -ENOMEM; @@ -618,7 +1088,7 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { - bool preload = !!(gfp_mask & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret, id; @@ -638,6 +1108,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) query->mad_buf->context[0] = query; query->id = id; + if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) { + if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (!ib_nl_make_request(query, gfp_mask)) + return id; + } + ib_sa_disable_local_svc(query); + } + ret = ib_post_send_mad(query->mad_buf, NULL); if (ret) { spin_lock_irqsave(&idr_lock, flags); @@ -677,9 +1155,9 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), mad->data, &rec); - rec.vlan_id = 0xffff; + rec.net = NULL; + rec.ifindex = 0; memset(rec.dmac, 0, ETH_ALEN); - memset(rec.smac, 0, ETH_ALEN); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); @@ -739,7 +1217,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; @@ -766,6 +1244,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, *sa_query = &query->sa_query; + query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE; + query->sa_query.mad_buf->context[1] = rec; + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; @@ -861,7 +1342,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client, method != IB_SA_METHOD_DELETE) return -EINVAL; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; @@ -953,7 +1434,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; @@ -1050,7 +1531,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; @@ -1153,16 +1634,10 @@ static void ib_sa_add_one(struct ib_device *device) { struct ib_sa_device *sa_dev; int s, e, i; + int count = 0; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; - - if (device->node_type == RDMA_NODE_IB_SWITCH) - s = e = 0; - else { - s = 1; - e = device->phys_port_cnt; - } + s = rdma_start_port(device); + e = rdma_end_port(device); sa_dev = kzalloc(sizeof *sa_dev + (e - s + 1) * sizeof (struct ib_sa_port), @@ -1175,7 +1650,7 @@ static void ib_sa_add_one(struct ib_device *device) for (i = 0; i <= e - s; ++i) { spin_lock_init(&sa_dev->port[i].ah_lock); - if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_sa(device, i + 1)) continue; sa_dev->port[i].sm_ah = NULL; @@ -1189,8 +1664,13 @@ static void ib_sa_add_one(struct ib_device *device) goto err; INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); + + count++; } + if (!count) + goto free; + ib_set_client_data(device, &sa_client, sa_dev); /* @@ -1204,25 +1684,26 @@ static void ib_sa_add_one(struct ib_device *device) if (ib_register_event_handler(&sa_dev->event_handler)) goto err; - for (i = 0; i <= e - s; ++i) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + for (i = 0; i <= e - s; ++i) { + if (rdma_cap_ib_sa(device, i + 1)) update_sm_ah(&sa_dev->port[i].update_task); + } return; err: - while (--i >= 0) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + while (--i >= 0) { + if (rdma_cap_ib_sa(device, i + 1)) ib_unregister_mad_agent(sa_dev->port[i].agent); - + } +free: kfree(sa_dev); - return; } -static void ib_sa_remove_one(struct ib_device *device) +static void ib_sa_remove_one(struct ib_device *device, void *client_data) { - struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_device *sa_dev = client_data; int i; if (!sa_dev) @@ -1233,7 +1714,7 @@ static void ib_sa_remove_one(struct ib_device *device) flush_workqueue(ib_wq); for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) { + if (rdma_cap_ib_sa(device, i + 1)) { ib_unregister_mad_agent(sa_dev->port[i].agent); if (sa_dev->port[i].sm_ah) kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); @@ -1250,6 +1731,8 @@ static int __init ib_sa_init(void) get_random_bytes(&tid, sizeof tid); + atomic_set(&ib_nl_sa_request_seq, 0); + ret = ib_register_client(&sa_client); if (ret) { printk(KERN_ERR "Couldn't register ib_sa client\n"); @@ -1262,7 +1745,25 @@ static int __init ib_sa_init(void) goto err2; } + ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq"); + if (!ib_nl_wq) { + ret = -ENOMEM; + goto err3; + } + + if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS, + ib_sa_cb_table)) { + pr_err("Failed to add netlink callback\n"); + ret = -EINVAL; + goto err4; + } + INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout); + return 0; +err4: + destroy_workqueue(ib_nl_wq); +err3: + mcast_cleanup(); err2: ib_unregister_client(&sa_client); err1: @@ -1271,6 +1772,10 @@ err1: static void __exit ib_sa_cleanup(void) { + ibnl_remove_client(RDMA_NL_LS); + cancel_delayed_work(&ib_nl_timed_work); + flush_workqueue(ib_nl_wq); + destroy_workqueue(ib_nl_wq); mcast_cleanup(); ib_unregister_client(&sa_client); idr_destroy(&query_idr); diff --git a/kernel/drivers/infiniband/core/smi.c b/kernel/drivers/infiniband/core/smi.c index 5855e4405..f19b23817 100644 --- a/kernel/drivers/infiniband/core/smi.c +++ b/kernel/drivers/infiniband/core/smi.c @@ -5,6 +5,7 @@ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -38,85 +39,82 @@ #include <rdma/ib_smi.h> #include "smi.h" - -/* - * Fixup a directed route SMP for sending - * Return 0 if the SMP should be discarded - */ -enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - u8 node_type, int port_num) +#include "opa_smi.h" + +static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num, + u8 *hop_ptr, u8 hop_cnt, + const u8 *initial_path, + const u8 *return_path, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) { - u8 hop_ptr, hop_cnt; - - hop_ptr = smp->hop_ptr; - hop_cnt = smp->hop_cnt; - /* See section 14.2.2.2, Vol 1 IB spec */ /* C14-6 -- valid hop_cnt values are from 0 to 63 */ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) return IB_SMI_DISCARD; - if (!ib_get_smp_direction(smp)) { + if (!direction) { /* C14-9:1 */ - if (hop_cnt && hop_ptr == 0) { - smp->hop_ptr++; - return (smp->initial_path[smp->hop_ptr] == + if (hop_cnt && *hop_ptr == 0) { + (*hop_ptr)++; + return (initial_path[*hop_ptr] == port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-9:2 */ - if (hop_ptr && hop_ptr < hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) + if (*hop_ptr && *hop_ptr < hop_cnt) { + if (!is_switch) return IB_SMI_DISCARD; - /* smp->return_path set when received */ - smp->hop_ptr++; - return (smp->initial_path[smp->hop_ptr] == + /* return_path set when received */ + (*hop_ptr)++; + return (initial_path[*hop_ptr] == port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-9:3 -- We're at the end of the DR segment of path */ - if (hop_ptr == hop_cnt) { - /* smp->return_path set when received */ - smp->hop_ptr++; - return (node_type == RDMA_NODE_IB_SWITCH || - smp->dr_dlid == IB_LID_PERMISSIVE ? + if (*hop_ptr == hop_cnt) { + /* return_path set when received */ + (*hop_ptr)++; + return (is_switch || + dr_dlid_is_permissive ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ /* C14-9:5 -- Fail unreasonable hop pointer */ - return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); } else { /* C14-13:1 */ - if (hop_cnt && hop_ptr == hop_cnt + 1) { - smp->hop_ptr--; - return (smp->return_path[smp->hop_ptr] == + if (hop_cnt && *hop_ptr == hop_cnt + 1) { + (*hop_ptr)--; + return (return_path[*hop_ptr] == port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-13:2 */ - if (2 <= hop_ptr && hop_ptr <= hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) + if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { + if (!is_switch) return IB_SMI_DISCARD; - smp->hop_ptr--; - return (smp->return_path[smp->hop_ptr] == + (*hop_ptr)--; + return (return_path[*hop_ptr] == port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-13:3 -- at the end of the DR segment of path */ - if (hop_ptr == 1) { - smp->hop_ptr--; + if (*hop_ptr == 1) { + (*hop_ptr)--; /* C14-13:3 -- SMPs destined for SM shouldn't be here */ - return (node_type == RDMA_NODE_IB_SWITCH || - smp->dr_slid == IB_LID_PERMISSIVE ? + return (is_switch || + dr_slid_is_permissive ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */ - if (hop_ptr == 0) + if (*hop_ptr == 0) return IB_SMI_HANDLE; /* C14-13:5 -- Check for unreasonable hop pointer */ @@ -125,105 +123,163 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, } /* - * Adjust information for a received SMP - * Return 0 if the SMP should be dropped + * Fixup a directed route SMP for sending + * Return IB_SMI_DISCARD if the SMP should be discarded */ -enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, - int port_num, int phys_port_cnt) +enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, + bool is_switch, int port_num) { - u8 hop_ptr, hop_cnt; + return __smi_handle_dr_smp_send(is_switch, port_num, + &smp->hop_ptr, smp->hop_cnt, + smp->initial_path, + smp->return_path, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} - hop_ptr = smp->hop_ptr; - hop_cnt = smp->hop_cnt; +enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, + bool is_switch, int port_num) +{ + return __smi_handle_dr_smp_send(is_switch, port_num, + &smp->hop_ptr, smp->hop_cnt, + smp->route.dr.initial_path, + smp->route.dr.return_path, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} +static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num, + int phys_port_cnt, + u8 *hop_ptr, u8 hop_cnt, + const u8 *initial_path, + u8 *return_path, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ /* See section 14.2.2.2, Vol 1 IB spec */ /* C14-6 -- valid hop_cnt values are from 0 to 63 */ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) return IB_SMI_DISCARD; - if (!ib_get_smp_direction(smp)) { + if (!direction) { /* C14-9:1 -- sender should have incremented hop_ptr */ - if (hop_cnt && hop_ptr == 0) + if (hop_cnt && *hop_ptr == 0) return IB_SMI_DISCARD; /* C14-9:2 -- intermediate hop */ - if (hop_ptr && hop_ptr < hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) + if (*hop_ptr && *hop_ptr < hop_cnt) { + if (!is_switch) return IB_SMI_DISCARD; - smp->return_path[hop_ptr] = port_num; - /* smp->hop_ptr updated when sending */ - return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ? + return_path[*hop_ptr] = port_num; + /* hop_ptr updated when sending */ + return (initial_path[*hop_ptr+1] <= phys_port_cnt ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-9:3 -- We're at the end of the DR segment of path */ - if (hop_ptr == hop_cnt) { + if (*hop_ptr == hop_cnt) { if (hop_cnt) - smp->return_path[hop_ptr] = port_num; - /* smp->hop_ptr updated when sending */ + return_path[*hop_ptr] = port_num; + /* hop_ptr updated when sending */ - return (node_type == RDMA_NODE_IB_SWITCH || - smp->dr_dlid == IB_LID_PERMISSIVE ? + return (is_switch || + dr_dlid_is_permissive ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ /* C14-9:5 -- fail unreasonable hop pointer */ - return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); } else { /* C14-13:1 */ - if (hop_cnt && hop_ptr == hop_cnt + 1) { - smp->hop_ptr--; - return (smp->return_path[smp->hop_ptr] == + if (hop_cnt && *hop_ptr == hop_cnt + 1) { + (*hop_ptr)--; + return (return_path[*hop_ptr] == port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-13:2 */ - if (2 <= hop_ptr && hop_ptr <= hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) + if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { + if (!is_switch) return IB_SMI_DISCARD; - /* smp->hop_ptr updated when sending */ - return (smp->return_path[hop_ptr-1] <= phys_port_cnt ? + /* hop_ptr updated when sending */ + return (return_path[*hop_ptr-1] <= phys_port_cnt ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-13:3 -- We're at the end of the DR segment of path */ - if (hop_ptr == 1) { - if (smp->dr_slid == IB_LID_PERMISSIVE) { + if (*hop_ptr == 1) { + if (dr_slid_is_permissive) { /* giving SMP to SM - update hop_ptr */ - smp->hop_ptr--; + (*hop_ptr)--; return IB_SMI_HANDLE; } - /* smp->hop_ptr updated when sending */ - return (node_type == RDMA_NODE_IB_SWITCH ? - IB_SMI_HANDLE : IB_SMI_DISCARD); + /* hop_ptr updated when sending */ + return (is_switch ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-13:4 -- hop_ptr = 0 -> give to SM */ /* C14-13:5 -- Check for unreasonable hop pointer */ - return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + return (*hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD); } } -enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp) +/* + * Adjust information for a received SMP + * Return IB_SMI_DISCARD if the SMP should be dropped + */ +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, + int port_num, int phys_port_cnt) { - u8 hop_ptr, hop_cnt; + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, + &smp->hop_ptr, smp->hop_cnt, + smp->initial_path, + smp->return_path, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} - hop_ptr = smp->hop_ptr; - hop_cnt = smp->hop_cnt; +/* + * Adjust information for a received SMP + * Return IB_SMI_DISCARD if the SMP should be dropped + */ +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, + int port_num, int phys_port_cnt) +{ + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, + &smp->hop_ptr, smp->hop_cnt, + smp->route.dr.initial_path, + smp->route.dr.return_path, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} - if (!ib_get_smp_direction(smp)) { +static enum smi_forward_action __smi_check_forward_dr_smp(u8 hop_ptr, u8 hop_cnt, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + if (!direction) { /* C14-9:2 -- intermediate hop */ if (hop_ptr && hop_ptr < hop_cnt) return IB_SMI_FORWARD; /* C14-9:3 -- at the end of the DR segment of path */ if (hop_ptr == hop_cnt) - return (smp->dr_dlid == IB_LID_PERMISSIVE ? + return (dr_dlid_is_permissive ? IB_SMI_SEND : IB_SMI_LOCAL); /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ @@ -236,10 +292,29 @@ enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp) /* C14-13:3 -- at the end of the DR segment of path */ if (hop_ptr == 1) - return (smp->dr_slid != IB_LID_PERMISSIVE ? + return (!dr_slid_is_permissive ? IB_SMI_SEND : IB_SMI_LOCAL); } return IB_SMI_LOCAL; + +} + +enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp) +{ + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp) +{ + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); } /* @@ -251,3 +326,13 @@ int smi_get_fwd_port(struct ib_smp *smp) return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] : smp->return_path[smp->hop_ptr-1]); } + +/* + * Return the forwarding port number from initial_path for outgoing SMP and + * from return_path for returning SMP + */ +int opa_smi_get_fwd_port(struct opa_smp *smp) +{ + return !opa_get_smp_direction(smp) ? smp->route.dr.initial_path[smp->hop_ptr+1] : + smp->route.dr.return_path[smp->hop_ptr-1]; +} diff --git a/kernel/drivers/infiniband/core/smi.h b/kernel/drivers/infiniband/core/smi.h index aff96bac4..33c91c8a1 100644 --- a/kernel/drivers/infiniband/core/smi.h +++ b/kernel/drivers/infiniband/core/smi.h @@ -51,12 +51,12 @@ enum smi_forward_action { IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */ }; -enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, int port_num, int phys_port_cnt); int smi_get_fwd_port(struct ib_smp *smp); extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp); extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - u8 node_type, int port_num); + bool is_switch, int port_num); /* * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM diff --git a/kernel/drivers/infiniband/core/sysfs.c b/kernel/drivers/infiniband/core/sysfs.c index cbd0383f6..b1f37d409 100644 --- a/kernel/drivers/infiniband/core/sysfs.c +++ b/kernel/drivers/infiniband/core/sysfs.c @@ -289,7 +289,7 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, union ib_gid gid; ssize_t ret; - ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid); + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, NULL); if (ret) return ret; @@ -326,6 +326,8 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, int width = (tab_attr->index >> 16) & 0xff; struct ib_mad *in_mad = NULL; struct ib_mad *out_mad = NULL; + size_t mad_size = sizeof(*out_mad); + u16 out_mad_pkey_index = 0; ssize_t ret; if (!p->ibdev->process_mad) @@ -347,7 +349,10 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, in_mad->data[41] = p->port_num; /* PortSelect field */ if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, - p->port_num, NULL, NULL, in_mad, out_mad) & + p->port_num, NULL, NULL, + (const struct ib_mad_hdr *)in_mad, mad_size, + (struct ib_mad_hdr *)out_mad, &mad_size, + &out_mad_pkey_index) & (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { ret = -EINVAL; @@ -452,28 +457,6 @@ static struct kobj_type port_type = { .default_attrs = port_default_attrs }; -static void ib_device_release(struct device *device) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - kfree(dev); -} - -static int ib_device_uevent(struct device *device, - struct kobj_uevent_env *env) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - if (add_uevent_var(env, "NAME=%s", dev->name)) - return -ENOMEM; - - /* - * It would be nice to pass the node GUID with the event... - */ - - return 0; -} - static struct attribute ** alloc_group_attrs(ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf), @@ -696,12 +679,6 @@ static struct device_attribute *ib_class_attributes[] = { &dev_attr_node_desc }; -static struct class ib_class = { - .name = "infiniband", - .dev_release = ib_device_release, - .dev_uevent = ib_device_uevent, -}; - /* Show a given an attribute in the statistics group */ static ssize_t show_protocol_stat(const struct device *device, struct device_attribute *attr, char *buf, @@ -840,14 +817,12 @@ int ib_device_register_sysfs(struct ib_device *device, int ret; int i; - class_dev->class = &ib_class; - class_dev->parent = device->dma_device; - dev_set_name(class_dev, "%s", device->name); - dev_set_drvdata(class_dev, device); - - INIT_LIST_HEAD(&device->port_list); + device->dev.parent = device->dma_device; + ret = dev_set_name(class_dev, "%s", device->name); + if (ret) + return ret; - ret = device_register(class_dev); + ret = device_add(class_dev); if (ret) goto err; @@ -864,7 +839,7 @@ int ib_device_register_sysfs(struct ib_device *device, goto err_put; } - if (device->node_type == RDMA_NODE_IB_SWITCH) { + if (rdma_cap_ib_switch(device)) { ret = add_port(device, 0, port_callback); if (ret) goto err_put; @@ -910,13 +885,3 @@ void ib_device_unregister_sysfs(struct ib_device *device) device_unregister(&device->dev); } - -int ib_sysfs_setup(void) -{ - return class_register(&ib_class); -} - -void ib_sysfs_cleanup(void) -{ - class_unregister(&ib_class); -} diff --git a/kernel/drivers/infiniband/core/ucm.c b/kernel/drivers/infiniband/core/ucm.c index f2f63933e..6b4e8a008 100644 --- a/kernel/drivers/infiniband/core/ucm.c +++ b/kernel/drivers/infiniband/core/ucm.c @@ -109,7 +109,7 @@ enum { #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) static void ib_ucm_add_one(struct ib_device *device); -static void ib_ucm_remove_one(struct ib_device *device); +static void ib_ucm_remove_one(struct ib_device *device, void *client_data); static struct ib_client ucm_client = { .name = "ucm", @@ -658,8 +658,7 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file, if (result) goto out; - result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask, - NULL); + result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask); out: ib_ucm_ctx_put(ctx); return result; @@ -1193,6 +1192,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp) return 0; } +static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static void ib_ucm_release_dev(struct device *dev) { struct ib_ucm_device *ucm_dev; @@ -1202,7 +1202,7 @@ static void ib_ucm_release_dev(struct device *dev) if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) clear_bit(ucm_dev->devnum, dev_map); else - clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map); + clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map); kfree(ucm_dev); } @@ -1226,7 +1226,6 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static int find_overflow_devnum(void) { int ret; @@ -1253,8 +1252,7 @@ static void ib_ucm_add_one(struct ib_device *device) dev_t base; struct ib_ucm_device *ucm_dev; - if (!device->alloc_ucontext || - rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + if (!device->alloc_ucontext || !rdma_cap_ib_cm(device, 1)) return; ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); @@ -1311,9 +1309,9 @@ err: return; } -static void ib_ucm_remove_one(struct ib_device *device) +static void ib_ucm_remove_one(struct ib_device *device, void *client_data) { - struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client); + struct ib_ucm_device *ucm_dev = client_data; if (!ucm_dev) return; diff --git a/kernel/drivers/infiniband/core/ucma.c b/kernel/drivers/infiniband/core/ucma.c index 45d67e922..8b5a934e1 100644 --- a/kernel/drivers/infiniband/core/ucma.c +++ b/kernel/drivers/infiniband/core/ucma.c @@ -42,6 +42,7 @@ #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/module.h> +#include <linux/nsproxy.h> #include <rdma/rdma_user_cm.h> #include <rdma/ib_marshall.h> @@ -74,6 +75,7 @@ struct ucma_file { struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; + struct workqueue_struct *close_wq; }; struct ucma_context { @@ -89,6 +91,13 @@ struct ucma_context { struct list_head list; struct list_head mc_list; + /* mark that device is in process of destroying the internal HW + * resources, protected by the global mut + */ + int closing; + /* sync between removal event and id destroy, protected by file mut */ + int destroying; + struct work_struct close_work; }; struct ucma_multicast { @@ -107,6 +116,7 @@ struct ucma_event { struct list_head list; struct rdma_cm_id *cm_id; struct rdma_ucm_event_resp resp; + struct work_struct close_work; }; static DEFINE_MUTEX(mut); @@ -132,8 +142,12 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) mutex_lock(&mut); ctx = _ucma_find_context(id, file); - if (!IS_ERR(ctx)) - atomic_inc(&ctx->ref); + if (!IS_ERR(ctx)) { + if (ctx->closing) + ctx = ERR_PTR(-EIO); + else + atomic_inc(&ctx->ref); + } mutex_unlock(&mut); return ctx; } @@ -144,6 +158,28 @@ static void ucma_put_ctx(struct ucma_context *ctx) complete(&ctx->comp); } +static void ucma_close_event_id(struct work_struct *work) +{ + struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work); + + rdma_destroy_id(uevent_close->cm_id); + kfree(uevent_close); +} + +static void ucma_close_id(struct work_struct *work) +{ + struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); + + /* once all inflight tasks are finished, we close all underlying + * resources. The context is still alive till its explicit destryoing + * by its creator. + */ + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + /* No new events will be generated after destroying the id. */ + rdma_destroy_id(ctx->cm_id); +} + static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) { struct ucma_context *ctx; @@ -152,6 +188,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) if (!ctx) return NULL; + INIT_WORK(&ctx->close_work, ucma_close_id); atomic_set(&ctx->ref, 1); init_completion(&ctx->comp); INIT_LIST_HEAD(&ctx->mc_list); @@ -242,6 +279,44 @@ static void ucma_set_event_context(struct ucma_context *ctx, } } +/* Called with file->mut locked for the relevant context. */ +static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) +{ + struct ucma_context *ctx = cm_id->context; + struct ucma_event *con_req_eve; + int event_found = 0; + + if (ctx->destroying) + return; + + /* only if context is pointing to cm_id that it owns it and can be + * queued to be closed, otherwise that cm_id is an inflight one that + * is part of that context event list pending to be detached and + * reattached to its new context as part of ucma_get_event, + * handled separately below. + */ + if (ctx->cm_id == cm_id) { + mutex_lock(&mut); + ctx->closing = 1; + mutex_unlock(&mut); + queue_work(ctx->file->close_wq, &ctx->close_work); + return; + } + + list_for_each_entry(con_req_eve, &ctx->file->event_list, list) { + if (con_req_eve->cm_id == cm_id && + con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { + list_del(&con_req_eve->list); + INIT_WORK(&con_req_eve->close_work, ucma_close_event_id); + queue_work(ctx->file->close_wq, &con_req_eve->close_work); + event_found = 1; + break; + } + } + if (!event_found) + printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n"); +} + static int ucma_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -276,14 +351,21 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, * We ignore events for new connections until userspace has set * their context. This can only happen if an error occurs on a * new connection before the user accepts it. This is okay, - * since the accept will just fail later. + * since the accept will just fail later. However, we do need + * to release the underlying HW resources in case of a device + * removal event. */ + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ucma_removal_event_handler(cm_id); + kfree(uevent); goto out; } list_add_tail(&uevent->list, &ctx->file->event_list); wake_up_interruptible(&ctx->file->poll_wait); + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ucma_removal_event_handler(cm_id); out: mutex_unlock(&ctx->file->mut); return ret; @@ -391,7 +473,8 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, return -ENOMEM; ctx->uid = cmd.uid; - ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type); + ctx->cm_id = rdma_create_id(current->nsproxy->net_ns, + ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(ctx->cm_id)) { ret = PTR_ERR(ctx->cm_id); goto err1; @@ -442,9 +525,15 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc) } /* - * We cannot hold file->mut when calling rdma_destroy_id() or we can - * deadlock. We also acquire file->mut in ucma_event_handler(), and - * rdma_destroy_id() will wait until all callbacks have completed. + * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At + * this point, no new events will be reported from the hardware. However, we + * still need to cleanup the UCMA context for this ID. Specifically, there + * might be events that have not yet been consumed by the user space software. + * These might include pending connect requests which we have not completed + * processing. We cannot call rdma_destroy_id while holding the lock of the + * context (file->mut), as it might cause a deadlock. We therefore extract all + * relevant events from the context pending events list while holding the + * mutex. After that we release them as needed. */ static int ucma_free_ctx(struct ucma_context *ctx) { @@ -452,8 +541,6 @@ static int ucma_free_ctx(struct ucma_context *ctx) struct ucma_event *uevent, *tmp; LIST_HEAD(list); - /* No new events will be generated after destroying the id. */ - rdma_destroy_id(ctx->cm_id); ucma_cleanup_multicast(ctx); @@ -501,10 +588,24 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - resp.events_reported = ucma_free_ctx(ctx); + mutex_lock(&ctx->file->mut); + ctx->destroying = 1; + mutex_unlock(&ctx->file->mut); + flush_workqueue(ctx->file->close_wq); + /* At this point it's guaranteed that there is no inflight + * closing task */ + mutex_lock(&mut); + if (!ctx->closing) { + mutex_unlock(&mut); + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + rdma_destroy_id(ctx->cm_id); + } else { + mutex_unlock(&mut); + } + + resp.events_reported = ucma_free_ctx(ctx); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; @@ -722,26 +823,13 @@ static ssize_t ucma_query_route(struct ucma_file *file, resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.port_num = ctx->cm_id->port_num; - switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(ctx->cm_id->device, - ctx->cm_id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ucma_copy_ib_route(&resp, &ctx->cm_id->route); - break; - case IB_LINK_LAYER_ETHERNET: - ucma_copy_iboe_route(&resp, &ctx->cm_id->route); - break; - default: - break; - } - break; - case RDMA_TRANSPORT_IWARP: + + if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_ib_route(&resp, &ctx->cm_id->route); + else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_iboe_route(&resp, &ctx->cm_id->route); + else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_iw_route(&resp, &ctx->cm_id->route); - break; - default: - break; - } out: if (copy_to_user((void __user *)(unsigned long)cmd.response, @@ -1125,7 +1213,6 @@ static int ucma_set_ib_path(struct ucma_context *ctx, return -EINVAL; memset(&sa_path, 0, sizeof(sa_path)); - sa_path.vlan_id = 0xffff; ib_sa_unpack_path(path_data->path_rec, &sa_path); ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); @@ -1334,10 +1421,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, mc = ERR_PTR(-ENOENT); else if (mc->ctx->file != file) mc = ERR_PTR(-EINVAL); - else { + else if (!atomic_inc_not_zero(&mc->ctx->ref)) + mc = ERR_PTR(-ENXIO); + else idr_remove(&multicast_idr, mc->id); - atomic_inc(&mc->ctx->ref); - } mutex_unlock(&mut); if (IS_ERR(mc)) { @@ -1367,10 +1454,10 @@ static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2) /* Acquire mutex's based on pointer comparison to prevent deadlock. */ if (file1 < file2) { mutex_lock(&file1->mut); - mutex_lock(&file2->mut); + mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING); } else { mutex_lock(&file2->mut); - mutex_lock(&file1->mut); + mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING); } } @@ -1538,6 +1625,12 @@ static int ucma_open(struct inode *inode, struct file *filp) if (!file) return -ENOMEM; + file->close_wq = create_singlethread_workqueue("ucma_close_id"); + if (!file->close_wq) { + kfree(file); + return -ENOMEM; + } + INIT_LIST_HEAD(&file->event_list); INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); @@ -1556,16 +1649,34 @@ static int ucma_close(struct inode *inode, struct file *filp) mutex_lock(&file->mut); list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { + ctx->destroying = 1; mutex_unlock(&file->mut); mutex_lock(&mut); idr_remove(&ctx_idr, ctx->id); mutex_unlock(&mut); + flush_workqueue(file->close_wq); + /* At that step once ctx was marked as destroying and workqueue + * was flushed we are safe from any inflights handlers that + * might put other closing task. + */ + mutex_lock(&mut); + if (!ctx->closing) { + mutex_unlock(&mut); + /* rdma_destroy_id ensures that no event handlers are + * inflight for that id before releasing it. + */ + rdma_destroy_id(ctx->cm_id); + } else { + mutex_unlock(&mut); + } + ucma_free_ctx(ctx); mutex_lock(&file->mut); } mutex_unlock(&file->mut); + destroy_workqueue(file->close_wq); kfree(file); return 0; } @@ -1629,6 +1740,7 @@ static void __exit ucma_cleanup(void) device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); idr_destroy(&ctx_idr); + idr_destroy(&multicast_idr); } module_init(ucma_init); diff --git a/kernel/drivers/infiniband/core/user_mad.c b/kernel/drivers/infiniband/core/user_mad.c index 928cdd20e..57f281f8d 100644 --- a/kernel/drivers/infiniband/core/user_mad.c +++ b/kernel/drivers/infiniband/core/user_mad.c @@ -99,7 +99,6 @@ struct ib_umad_port { }; struct ib_umad_device { - int start_port, end_port; struct kobject kobj; struct ib_umad_port port[0]; }; @@ -134,7 +133,7 @@ static DEFINE_SPINLOCK(port_lock); static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); -static void ib_umad_remove_one(struct ib_device *device); +static void ib_umad_remove_one(struct ib_device *device, void *client_data); static void ib_umad_release_dev(struct kobject *kobj) { @@ -263,20 +262,23 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf, { struct ib_mad_recv_buf *recv_buf; int left, seg_payload, offset, max_seg_payload; + size_t seg_size; - /* We need enough room to copy the first (or only) MAD segment. */ recv_buf = &packet->recv_wc->recv_buf; - if ((packet->length <= sizeof (*recv_buf->mad) && + seg_size = packet->recv_wc->mad_seg_size; + + /* We need enough room to copy the first (or only) MAD segment. */ + if ((packet->length <= seg_size && count < hdr_size(file) + packet->length) || - (packet->length > sizeof (*recv_buf->mad) && - count < hdr_size(file) + sizeof (*recv_buf->mad))) + (packet->length > seg_size && + count < hdr_size(file) + seg_size)) return -EINVAL; if (copy_to_user(buf, &packet->mad, hdr_size(file))) return -EFAULT; buf += hdr_size(file); - seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad)); + seg_payload = min_t(int, packet->length, seg_size); if (copy_to_user(buf, recv_buf->mad, seg_payload)) return -EFAULT; @@ -293,7 +295,7 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf, return -ENOSPC; } offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class); - max_seg_payload = sizeof (struct ib_mad) - offset; + max_seg_payload = seg_size - offset; for (left = packet->length - seg_payload, buf += seg_payload; left; left -= seg_payload, buf += seg_payload) { @@ -426,11 +428,11 @@ static int is_duplicate(struct ib_umad_file *file, * the same TID, reject the second as a duplicate. This is more * restrictive than required by the spec. */ - if (!ib_response_mad((struct ib_mad *) hdr)) { - if (!ib_response_mad((struct ib_mad *) sent_hdr)) + if (!ib_response_mad(hdr)) { + if (!ib_response_mad(sent_hdr)) return 1; continue; - } else if (!ib_response_mad((struct ib_mad *) sent_hdr)) + } else if (!ib_response_mad(sent_hdr)) continue; if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr)) @@ -451,6 +453,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, struct ib_rmpp_mad *rmpp_mad; __be64 *tid; int ret, data_len, hdr_len, copy_offset, rmpp_active; + u8 base_version; if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) return -EINVAL; @@ -517,11 +520,13 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, rmpp_active = 0; } + base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version; data_len = count - hdr_size(file) - hdr_len; packet->msg = ib_create_send_mad(agent, be32_to_cpu(packet->mad.hdr.qpn), packet->mad.hdr.pkey_index, rmpp_active, - hdr_len, data_len, GFP_KERNEL); + hdr_len, data_len, GFP_KERNEL, + base_version); if (IS_ERR(packet->msg)) { ret = PTR_ERR(packet->msg); goto err_ah; @@ -1273,16 +1278,10 @@ static void ib_umad_add_one(struct ib_device *device) { struct ib_umad_device *umad_dev; int s, e, i; + int count = 0; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; - - if (device->node_type == RDMA_NODE_IB_SWITCH) - s = e = 0; - else { - s = 1; - e = device->phys_port_cnt; - } + s = rdma_start_port(device); + e = rdma_end_port(device); umad_dev = kzalloc(sizeof *umad_dev + (e - s + 1) * sizeof (struct ib_umad_port), @@ -1292,38 +1291,49 @@ static void ib_umad_add_one(struct ib_device *device) kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype); - umad_dev->start_port = s; - umad_dev->end_port = e; - for (i = s; i <= e; ++i) { + if (!rdma_cap_ib_mad(device, i)) + continue; + umad_dev->port[i - s].umad_dev = umad_dev; if (ib_umad_init_port(device, i, umad_dev, &umad_dev->port[i - s])) goto err; + + count++; } + if (!count) + goto free; + ib_set_client_data(device, &umad_client, umad_dev); return; err: - while (--i >= s) - ib_umad_kill_port(&umad_dev->port[i - s]); + while (--i >= s) { + if (!rdma_cap_ib_mad(device, i)) + continue; + ib_umad_kill_port(&umad_dev->port[i - s]); + } +free: kobject_put(&umad_dev->kobj); } -static void ib_umad_remove_one(struct ib_device *device) +static void ib_umad_remove_one(struct ib_device *device, void *client_data) { - struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client); + struct ib_umad_device *umad_dev = client_data; int i; if (!umad_dev) return; - for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) - ib_umad_kill_port(&umad_dev->port[i]); + for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) { + if (rdma_cap_ib_mad(device, i + rdma_start_port(device))) + ib_umad_kill_port(&umad_dev->port[i]); + } kobject_put(&umad_dev->kobj); } diff --git a/kernel/drivers/infiniband/core/uverbs.h b/kernel/drivers/infiniband/core/uverbs.h index bebf11a66..94bbd8c15 100644 --- a/kernel/drivers/infiniband/core/uverbs.h +++ b/kernel/drivers/infiniband/core/uverbs.h @@ -89,12 +89,16 @@ struct ib_uverbs_device { int num_comp_vectors; struct completion comp; struct device *dev; - struct ib_device *ib_dev; + struct ib_device __rcu *ib_dev; int devnum; struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; struct kobject kobj; + struct srcu_struct disassociate_srcu; + struct mutex lists_mutex; /* protect lists */ + struct list_head uverbs_file_list; + struct list_head uverbs_events_file_list; }; struct ib_uverbs_event_file { @@ -106,6 +110,7 @@ struct ib_uverbs_event_file { wait_queue_head_t poll_wait; struct fasync_struct *async_queue; struct list_head event_list; + struct list_head list; }; struct ib_uverbs_file { @@ -115,6 +120,8 @@ struct ib_uverbs_file { struct ib_ucontext *ucontext; struct ib_event_handler event_handler; struct ib_uverbs_event_file *async_file; + struct list_head list; + int is_closed; }; struct ib_uverbs_event { @@ -178,7 +185,9 @@ extern struct idr ib_uverbs_rule_idr; void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, + struct ib_device *ib_dev, int is_async); +void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file); struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd); void ib_uverbs_release_ucq(struct ib_uverbs_file *file, @@ -213,6 +222,7 @@ struct ib_uverbs_flow_spec { #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ + struct ib_device *ib_dev, \ const char __user *buf, int in_len, \ int out_len) @@ -254,11 +264,14 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); #define IB_UVERBS_DECLARE_EX_CMD(name) \ int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_device *ib_dev, \ struct ib_udata *ucore, \ struct ib_udata *uhw) IB_UVERBS_DECLARE_EX_CMD(create_flow); IB_UVERBS_DECLARE_EX_CMD(destroy_flow); IB_UVERBS_DECLARE_EX_CMD(query_device); +IB_UVERBS_DECLARE_EX_CMD(create_cq); +IB_UVERBS_DECLARE_EX_CMD(create_qp); #endif /* UVERBS_H */ diff --git a/kernel/drivers/infiniband/core/uverbs_cmd.c b/kernel/drivers/infiniband/core/uverbs_cmd.c index ccc2494b4..1c02deab0 100644 --- a/kernel/drivers/infiniband/core/uverbs_cmd.c +++ b/kernel/drivers/infiniband/core/uverbs_cmd.c @@ -62,9 +62,11 @@ static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; * The ib_uobject locking scheme is as follows: * * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it - * needs to be held during all idr operations. When an object is + * needs to be held during all idr write operations. When an object is * looked up, a reference must be taken on the object's kref before - * dropping this lock. + * dropping this lock. For read operations, the rcu_read_lock() + * and rcu_write_lock() but similarly the kref reference is grabbed + * before the rcu_read_unlock(). * * - Each object also has an rwsem. This rwsem must be held for * reading while an operation that uses the object is performed. @@ -96,7 +98,7 @@ static void init_uobj(struct ib_uobject *uobj, u64 user_handle, static void release_uobj(struct kref *kref) { - kfree(container_of(kref, struct ib_uobject, ref)); + kfree_rcu(container_of(kref, struct ib_uobject, ref), rcu); } static void put_uobj(struct ib_uobject *uobj) @@ -145,7 +147,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, { struct ib_uobject *uobj; - spin_lock(&ib_uverbs_idr_lock); + rcu_read_lock(); uobj = idr_find(idr, id); if (uobj) { if (uobj->context == context) @@ -153,7 +155,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, else uobj = NULL; } - spin_unlock(&ib_uverbs_idr_lock); + rcu_read_unlock(); return uobj; } @@ -282,13 +284,13 @@ static void put_xrcd_read(struct ib_uobject *uobj) } ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_get_context cmd; struct ib_uverbs_get_context_resp resp; struct ib_udata udata; - struct ib_device *ibdev = file->device->ib_dev; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_device_attr dev_attr; #endif @@ -313,13 +315,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - ucontext = ibdev->alloc_ucontext(ibdev, &udata); + ucontext = ib_dev->alloc_ucontext(ib_dev, &udata); if (IS_ERR(ucontext)) { ret = PTR_ERR(ucontext); goto err; } - ucontext->device = ibdev; + ucontext->device = ib_dev; INIT_LIST_HEAD(&ucontext->pd_list); INIT_LIST_HEAD(&ucontext->mr_list); INIT_LIST_HEAD(&ucontext->mw_list); @@ -340,7 +342,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->odp_mrs_count = 0; INIT_LIST_HEAD(&ucontext->no_private_counters); - ret = ib_query_device(ibdev, &dev_attr); + ret = ib_query_device(ib_dev, &dev_attr); if (ret) goto err_free; if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) @@ -355,7 +357,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, goto err_free; resp.async_fd = ret; - filp = ib_uverbs_alloc_event_file(file, 1); + filp = ib_uverbs_alloc_event_file(file, ib_dev, 1); if (IS_ERR(filp)) { ret = PTR_ERR(filp); goto err_fd; @@ -367,16 +369,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, goto err_file; } - file->async_file = filp->private_data; - - INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev, - ib_uverbs_event_handler); - ret = ib_register_event_handler(&file->event_handler); - if (ret) - goto err_file; - - kref_get(&file->async_file->ref); - kref_get(&file->ref); file->ucontext = ucontext; fd_install(resp.async_fd, filp); @@ -386,6 +378,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, return in_len; err_file: + ib_uverbs_free_async_event_file(file); fput(filp); err_fd: @@ -393,7 +386,7 @@ err_fd: err_free: put_pid(ucontext->tgid); - ibdev->dealloc_ucontext(ucontext); + ib_dev->dealloc_ucontext(ucontext); err: mutex_unlock(&file->mutex); @@ -401,11 +394,12 @@ err: } static void copy_query_dev_fields(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_uverbs_query_device_resp *resp, struct ib_device_attr *attr) { resp->fw_ver = attr->fw_ver; - resp->node_guid = file->device->ib_dev->node_guid; + resp->node_guid = ib_dev->node_guid; resp->sys_image_guid = attr->sys_image_guid; resp->max_mr_size = attr->max_mr_size; resp->page_size_cap = attr->page_size_cap; @@ -443,10 +437,11 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file, resp->max_srq_sge = attr->max_srq_sge; resp->max_pkeys = attr->max_pkeys; resp->local_ca_ack_delay = attr->local_ca_ack_delay; - resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; + resp->phys_port_cnt = ib_dev->phys_port_cnt; } ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -461,12 +456,12 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = ib_query_device(file->device->ib_dev, &attr); + ret = ib_query_device(ib_dev, &attr); if (ret) return ret; memset(&resp, 0, sizeof resp); - copy_query_dev_fields(file, &resp, &attr); + copy_query_dev_fields(file, ib_dev, &resp, &attr); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -476,6 +471,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, } ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -490,7 +486,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr); + ret = ib_query_port(ib_dev, cmd.port_num, &attr); if (ret) return ret; @@ -515,7 +511,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.active_width = attr.active_width; resp.active_speed = attr.active_speed; resp.phys_state = attr.phys_state; - resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev, + resp.link_layer = rdma_port_get_link_layer(ib_dev, cmd.port_num); if (copy_to_user((void __user *) (unsigned long) cmd.response, @@ -526,6 +522,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, } ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -553,15 +550,15 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, init_uobj(uobj, 0, file->ucontext, &pd_lock_class); down_write(&uobj->mutex); - pd = file->device->ib_dev->alloc_pd(file->device->ib_dev, - file->ucontext, &udata); + pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata); if (IS_ERR(pd)) { ret = PTR_ERR(pd); goto err; } - pd->device = file->device->ib_dev; + pd->device = ib_dev; pd->uobject = uobj; + pd->local_mr = NULL; atomic_set(&pd->usecnt, 0); uobj->object = pd; @@ -600,11 +597,13 @@ err: } ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dealloc_pd cmd; struct ib_uobject *uobj; + struct ib_pd *pd; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -613,15 +612,20 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext); if (!uobj) return -EINVAL; + pd = uobj->object; - ret = ib_dealloc_pd(uobj->object); - if (!ret) - uobj->live = 0; - - put_uobj_write(uobj); + if (atomic_read(&pd->usecnt)) { + ret = -EBUSY; + goto err_put; + } + ret = pd->device->dealloc_pd(uobj->object); + WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); if (ret) - return ret; + goto err_put; + + uobj->live = 0; + put_uobj_write(uobj); idr_remove_uobj(&ib_uverbs_pd_idr, uobj); @@ -632,6 +636,10 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, put_uobj(uobj); return in_len; + +err_put: + put_uobj_write(uobj); + return ret; } struct xrcd_table_entry { @@ -720,6 +728,7 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev, } ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -778,15 +787,14 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, down_write(&obj->uobject.mutex); if (!xrcd) { - xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, - file->ucontext, &udata); + xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata); if (IS_ERR(xrcd)) { ret = PTR_ERR(xrcd); goto err; } xrcd->inode = inode; - xrcd->device = file->device->ib_dev; + xrcd->device = ib_dev; atomic_set(&xrcd->usecnt, 0); mutex_init(&xrcd->tgt_qp_mutex); INIT_LIST_HEAD(&xrcd->tgt_qp_list); @@ -857,6 +865,7 @@ err_tree_mutex_unlock: } ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -934,6 +943,7 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, } ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1043,6 +1053,7 @@ err_free: } ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1136,6 +1147,7 @@ put_uobjs: } ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1174,8 +1186,9 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, } ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_alloc_mw cmd; struct ib_uverbs_alloc_mw_resp resp; @@ -1256,8 +1269,9 @@ err_free: } ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_dealloc_mw cmd; struct ib_mw *mw; @@ -1294,6 +1308,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, } ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1313,7 +1328,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, return ret; resp.fd = ret; - filp = ib_uverbs_alloc_event_file(file, 0); + filp = ib_uverbs_alloc_event_file(file, ib_dev, 0); if (IS_ERR(filp)) { put_unused_fd(resp.fd); return PTR_ERR(filp); @@ -1330,40 +1345,38 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, return in_len; } -ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw, + struct ib_uverbs_ex_create_cq *cmd, + size_t cmd_sz, + int (*cb)(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *udata, + void *context), + void *context) { - struct ib_uverbs_create_cq cmd; - struct ib_uverbs_create_cq_resp resp; - struct ib_udata udata; struct ib_ucq_object *obj; struct ib_uverbs_event_file *ev_file = NULL; struct ib_cq *cq; int ret; + struct ib_uverbs_ex_create_cq_resp resp; + struct ib_cq_init_attr attr = {}; - if (out_len < sizeof resp) - return -ENOSPC; - - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; - - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); - - if (cmd.comp_vector >= file->device->num_comp_vectors) - return -EINVAL; + if (cmd->comp_vector >= file->device->num_comp_vectors) + return ERR_PTR(-EINVAL); obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_class); + init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, &cq_lock_class); down_write(&obj->uobject.mutex); - if (cmd.comp_channel >= 0) { - ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel); + if (cmd->comp_channel >= 0) { + ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel); if (!ev_file) { ret = -EINVAL; goto err; @@ -1376,15 +1389,20 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->async_list); - cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe, - cmd.comp_vector, - file->ucontext, &udata); + attr.cqe = cmd->cqe; + attr.comp_vector = cmd->comp_vector; + + if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) + attr.flags = cmd->flags; + + cq = ib_dev->create_cq(ib_dev, &attr, + file->ucontext, uhw); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto err_file; } - cq->device = file->device->ib_dev; + cq->device = ib_dev; cq->uobject = &obj->uobject; cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; @@ -1397,14 +1415,15 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, goto err_free; memset(&resp, 0, sizeof resp); - resp.cq_handle = obj->uobject.id; - resp.cqe = cq->cqe; + resp.base.cq_handle = obj->uobject.id; + resp.base.cqe = cq->cqe; - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { - ret = -EFAULT; - goto err_copy; - } + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + + ret = cb(file, obj, &resp, ucore, context); + if (ret) + goto err_cb; mutex_lock(&file->mutex); list_add_tail(&obj->uobject.list, &file->ucontext->cq_list); @@ -1414,9 +1433,9 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, up_write(&obj->uobject.mutex); - return in_len; + return obj; -err_copy: +err_cb: idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject); err_free: @@ -1428,10 +1447,112 @@ err_file: err: put_uobj_write(&obj->uobject); - return ret; + + return ERR_PTR(ret); +} + +static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *ucore, void *context) +{ + if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base))) + return -EFAULT; + + return 0; +} + +ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_cq cmd; + struct ib_uverbs_ex_create_cq cmd_ex; + struct ib_uverbs_create_cq_resp resp; + struct ib_udata ucore; + struct ib_udata uhw; + struct ib_ucq_object *obj; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + INIT_UDATA(&ucore, buf, (unsigned long)cmd.response, sizeof(cmd), sizeof(resp)); + + INIT_UDATA(&uhw, buf + sizeof(cmd), + (unsigned long)cmd.response + sizeof(resp), + in_len - sizeof(cmd), out_len - sizeof(resp)); + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + cmd_ex.user_handle = cmd.user_handle; + cmd_ex.cqe = cmd.cqe; + cmd_ex.comp_vector = cmd.comp_vector; + cmd_ex.comp_channel = cmd.comp_channel; + + obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex, + offsetof(typeof(cmd_ex), comp_channel) + + sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb, + NULL); + + if (IS_ERR(obj)) + return PTR_ERR(obj); + + return in_len; +} + +static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *ucore, void *context) +{ + if (ib_copy_to_udata(ucore, resp, resp->response_length)) + return -EFAULT; + + return 0; +} + +int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_cq_resp resp; + struct ib_uverbs_ex_create_cq cmd; + struct ib_ucq_object *obj; + int err; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + if (ucore->outlen < (offsetof(typeof(resp), response_length) + + sizeof(resp.response_length))) + return -ENOSPC; + + obj = create_cq(file, ib_dev, ucore, uhw, &cmd, + min(ucore->inlen, sizeof(cmd)), + ib_uverbs_ex_create_cq_cb, NULL); + + if (IS_ERR(obj)) + return PTR_ERR(obj); + + return 0; } ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1495,6 +1616,7 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) } ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1546,6 +1668,7 @@ out_put: } ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1568,6 +1691,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1619,65 +1743,65 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, return in_len; } -ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) -{ - struct ib_uverbs_create_qp cmd; - struct ib_uverbs_create_qp_resp resp; - struct ib_udata udata; - struct ib_uqp_object *obj; - struct ib_device *device; - struct ib_pd *pd = NULL; - struct ib_xrcd *xrcd = NULL; - struct ib_uobject *uninitialized_var(xrcd_uobj); - struct ib_cq *scq = NULL, *rcq = NULL; - struct ib_srq *srq = NULL; - struct ib_qp *qp; - struct ib_qp_init_attr attr; - int ret; - - if (out_len < sizeof resp) - return -ENOSPC; - - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; +static int create_qp(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw, + struct ib_uverbs_ex_create_qp *cmd, + size_t cmd_sz, + int (*cb)(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *udata), + void *context) +{ + struct ib_uqp_object *obj; + struct ib_device *device; + struct ib_pd *pd = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_cq *scq = NULL, *rcq = NULL; + struct ib_srq *srq = NULL; + struct ib_qp *qp; + char *buf; + struct ib_qp_init_attr attr; + struct ib_uverbs_ex_create_qp_resp resp; + int ret; - if (cmd.qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) return -EPERM; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); - obj = kzalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; - init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class); + init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, + &qp_lock_class); down_write(&obj->uevent.uobject.mutex); - if (cmd.qp_type == IB_QPT_XRC_TGT) { - xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj); + if (cmd->qp_type == IB_QPT_XRC_TGT) { + xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, + &xrcd_uobj); if (!xrcd) { ret = -EINVAL; goto err_put; } device = xrcd->device; } else { - if (cmd.qp_type == IB_QPT_XRC_INI) { - cmd.max_recv_wr = cmd.max_recv_sge = 0; + if (cmd->qp_type == IB_QPT_XRC_INI) { + cmd->max_recv_wr = 0; + cmd->max_recv_sge = 0; } else { - if (cmd.is_srq) { - srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (cmd->is_srq) { + srq = idr_read_srq(cmd->srq_handle, + file->ucontext); if (!srq || srq->srq_type != IB_SRQT_BASIC) { ret = -EINVAL; goto err_put; } } - if (cmd.recv_cq_handle != cmd.send_cq_handle) { - rcq = idr_read_cq(cmd.recv_cq_handle, file->ucontext, 0); + if (cmd->recv_cq_handle != cmd->send_cq_handle) { + rcq = idr_read_cq(cmd->recv_cq_handle, + file->ucontext, 0); if (!rcq) { ret = -EINVAL; goto err_put; @@ -1685,9 +1809,9 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, } } - scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, !!rcq); + scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); rcq = rcq ?: scq; - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = idr_read_pd(cmd->pd_handle, file->ucontext); if (!pd || !scq) { ret = -EINVAL; goto err_put; @@ -1702,31 +1826,49 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, attr.recv_cq = rcq; attr.srq = srq; attr.xrcd = xrcd; - attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; - attr.qp_type = cmd.qp_type; + attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : + IB_SIGNAL_REQ_WR; + attr.qp_type = cmd->qp_type; attr.create_flags = 0; - attr.cap.max_send_wr = cmd.max_send_wr; - attr.cap.max_recv_wr = cmd.max_recv_wr; - attr.cap.max_send_sge = cmd.max_send_sge; - attr.cap.max_recv_sge = cmd.max_recv_sge; - attr.cap.max_inline_data = cmd.max_inline_data; + attr.cap.max_send_wr = cmd->max_send_wr; + attr.cap.max_recv_wr = cmd->max_recv_wr; + attr.cap.max_send_sge = cmd->max_send_sge; + attr.cap.max_recv_sge = cmd->max_recv_sge; + attr.cap.max_inline_data = cmd->max_inline_data; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); - if (cmd.qp_type == IB_QPT_XRC_TGT) + if (cmd_sz >= offsetof(typeof(*cmd), create_flags) + + sizeof(cmd->create_flags)) + attr.create_flags = cmd->create_flags; + + if (attr.create_flags & ~IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + ret = -EINVAL; + goto err_put; + } + + buf = (void *)cmd + sizeof(*cmd); + if (cmd_sz > sizeof(*cmd)) + if (!(buf[0] == 0 && !memcmp(buf, buf + 1, + cmd_sz - sizeof(*cmd) - 1))) { + ret = -EINVAL; + goto err_put; + } + + if (cmd->qp_type == IB_QPT_XRC_TGT) qp = ib_create_qp(pd, &attr); else - qp = device->create_qp(pd, &attr, &udata); + qp = device->create_qp(pd, &attr, uhw); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } - if (cmd.qp_type != IB_QPT_XRC_TGT) { + if (cmd->qp_type != IB_QPT_XRC_TGT) { qp->real_qp = qp; qp->device = device; qp->pd = pd; @@ -1752,19 +1894,20 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, goto err_destroy; memset(&resp, 0, sizeof resp); - resp.qpn = qp->qp_num; - resp.qp_handle = obj->uevent.uobject.id; - resp.max_recv_sge = attr.cap.max_recv_sge; - resp.max_send_sge = attr.cap.max_send_sge; - resp.max_recv_wr = attr.cap.max_recv_wr; - resp.max_send_wr = attr.cap.max_send_wr; - resp.max_inline_data = attr.cap.max_inline_data; - - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { - ret = -EFAULT; - goto err_copy; - } + resp.base.qpn = qp->qp_num; + resp.base.qp_handle = obj->uevent.uobject.id; + resp.base.max_recv_sge = attr.cap.max_recv_sge; + resp.base.max_send_sge = attr.cap.max_send_sge; + resp.base.max_recv_wr = attr.cap.max_recv_wr; + resp.base.max_send_wr = attr.cap.max_send_wr; + resp.base.max_inline_data = attr.cap.max_inline_data; + + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + + ret = cb(file, &resp, ucore); + if (ret) + goto err_cb; if (xrcd) { obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, @@ -1790,9 +1933,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, up_write(&obj->uevent.uobject.mutex); - return in_len; - -err_copy: + return 0; +err_cb: idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); err_destroy: @@ -1814,7 +1956,115 @@ err_put: return ret; } +static int ib_uverbs_create_qp_cb(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *ucore) +{ + if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base))) + return -EFAULT; + + return 0; +} + +ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_qp cmd; + struct ib_uverbs_ex_create_qp cmd_ex; + struct ib_udata ucore; + struct ib_udata uhw; + ssize_t resp_size = sizeof(struct ib_uverbs_create_qp_resp); + int err; + + if (out_len < resp_size) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + INIT_UDATA(&ucore, buf, (unsigned long)cmd.response, sizeof(cmd), + resp_size); + INIT_UDATA(&uhw, buf + sizeof(cmd), + (unsigned long)cmd.response + resp_size, + in_len - sizeof(cmd), out_len - resp_size); + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + cmd_ex.user_handle = cmd.user_handle; + cmd_ex.pd_handle = cmd.pd_handle; + cmd_ex.send_cq_handle = cmd.send_cq_handle; + cmd_ex.recv_cq_handle = cmd.recv_cq_handle; + cmd_ex.srq_handle = cmd.srq_handle; + cmd_ex.max_send_wr = cmd.max_send_wr; + cmd_ex.max_recv_wr = cmd.max_recv_wr; + cmd_ex.max_send_sge = cmd.max_send_sge; + cmd_ex.max_recv_sge = cmd.max_recv_sge; + cmd_ex.max_inline_data = cmd.max_inline_data; + cmd_ex.sq_sig_all = cmd.sq_sig_all; + cmd_ex.qp_type = cmd.qp_type; + cmd_ex.is_srq = cmd.is_srq; + + err = create_qp(file, &ucore, &uhw, &cmd_ex, + offsetof(typeof(cmd_ex), is_srq) + + sizeof(cmd.is_srq), ib_uverbs_create_qp_cb, + NULL); + + if (err) + return err; + + return in_len; +} + +static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *ucore) +{ + if (ib_copy_to_udata(ucore, resp, resp->response_length)) + return -EFAULT; + + return 0; +} + +int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_qp_resp resp; + struct ib_uverbs_ex_create_qp cmd = {0}; + int err; + + if (ucore->inlen < (offsetof(typeof(cmd), comp_mask) + + sizeof(cmd.comp_mask))) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + if (ucore->outlen < (offsetof(typeof(resp), response_length) + + sizeof(resp.response_length))) + return -ENOSPC; + + err = create_qp(file, ucore, uhw, &cmd, + min(ucore->inlen, sizeof(cmd)), + ib_uverbs_ex_create_qp_cb, NULL); + + if (err) + return err; + + return 0; +} + ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_open_qp cmd; @@ -1909,6 +2159,7 @@ err_put: } ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2023,6 +2274,7 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask) } ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2095,7 +2347,7 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; if (qp->real_qp == qp) { - ret = ib_resolve_eth_l2_attrs(qp, attr, &cmd.attr_mask); + ret = ib_resolve_eth_dmac(qp, attr, &cmd.attr_mask); if (ret) goto release_qp; ret = qp->device->modify_qp(qp, attr, @@ -2119,6 +2371,7 @@ out: } ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2176,7 +2429,14 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, return in_len; } +static void *alloc_wr(size_t wr_size, __u32 num_sge) +{ + return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) + + num_sge * sizeof (struct ib_sge), GFP_KERNEL); +}; + ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2188,6 +2448,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, int i, sg_ind; int is_ud; ssize_t ret = -EINVAL; + size_t next_size; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -2223,14 +2484,87 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, goto out_put; } - next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + - user_wr->num_sge * sizeof (struct ib_sge), - GFP_KERNEL); - if (!next) { - ret = -ENOMEM; + if (is_ud) { + struct ib_ud_wr *ud; + + if (user_wr->opcode != IB_WR_SEND && + user_wr->opcode != IB_WR_SEND_WITH_IMM) { + ret = -EINVAL; + goto out_put; + } + + next_size = sizeof(*ud); + ud = alloc_wr(next_size, user_wr->num_sge); + if (!ud) { + ret = -ENOMEM; + goto out_put; + } + + ud->ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext); + if (!ud->ah) { + kfree(ud); + ret = -EINVAL; + goto out_put; + } + ud->remote_qpn = user_wr->wr.ud.remote_qpn; + ud->remote_qkey = user_wr->wr.ud.remote_qkey; + + next = &ud->wr; + } else if (user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || + user_wr->opcode == IB_WR_RDMA_WRITE || + user_wr->opcode == IB_WR_RDMA_READ) { + struct ib_rdma_wr *rdma; + + next_size = sizeof(*rdma); + rdma = alloc_wr(next_size, user_wr->num_sge); + if (!rdma) { + ret = -ENOMEM; + goto out_put; + } + + rdma->remote_addr = user_wr->wr.rdma.remote_addr; + rdma->rkey = user_wr->wr.rdma.rkey; + + next = &rdma->wr; + } else if (user_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || + user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + struct ib_atomic_wr *atomic; + + next_size = sizeof(*atomic); + atomic = alloc_wr(next_size, user_wr->num_sge); + if (!atomic) { + ret = -ENOMEM; + goto out_put; + } + + atomic->remote_addr = user_wr->wr.atomic.remote_addr; + atomic->compare_add = user_wr->wr.atomic.compare_add; + atomic->swap = user_wr->wr.atomic.swap; + atomic->rkey = user_wr->wr.atomic.rkey; + + next = &atomic->wr; + } else if (user_wr->opcode == IB_WR_SEND || + user_wr->opcode == IB_WR_SEND_WITH_IMM || + user_wr->opcode == IB_WR_SEND_WITH_INV) { + next_size = sizeof(*next); + next = alloc_wr(next_size, user_wr->num_sge); + if (!next) { + ret = -ENOMEM; + goto out_put; + } + } else { + ret = -EINVAL; goto out_put; } + if (user_wr->opcode == IB_WR_SEND_WITH_IMM || + user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; + } else if (user_wr->opcode == IB_WR_SEND_WITH_INV) { + next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey; + } + if (!last) wr = next; else @@ -2243,63 +2577,9 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, next->opcode = user_wr->opcode; next->send_flags = user_wr->send_flags; - if (is_ud) { - if (next->opcode != IB_WR_SEND && - next->opcode != IB_WR_SEND_WITH_IMM) { - ret = -EINVAL; - goto out_put; - } - - next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, - file->ucontext); - if (!next->wr.ud.ah) { - ret = -EINVAL; - goto out_put; - } - next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; - next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; - if (next->opcode == IB_WR_SEND_WITH_IMM) - next->ex.imm_data = - (__be32 __force) user_wr->ex.imm_data; - } else { - switch (next->opcode) { - case IB_WR_RDMA_WRITE_WITH_IMM: - next->ex.imm_data = - (__be32 __force) user_wr->ex.imm_data; - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_READ: - next->wr.rdma.remote_addr = - user_wr->wr.rdma.remote_addr; - next->wr.rdma.rkey = - user_wr->wr.rdma.rkey; - break; - case IB_WR_SEND_WITH_IMM: - next->ex.imm_data = - (__be32 __force) user_wr->ex.imm_data; - break; - case IB_WR_SEND_WITH_INV: - next->ex.invalidate_rkey = - user_wr->ex.invalidate_rkey; - break; - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - next->wr.atomic.remote_addr = - user_wr->wr.atomic.remote_addr; - next->wr.atomic.compare_add = - user_wr->wr.atomic.compare_add; - next->wr.atomic.swap = user_wr->wr.atomic.swap; - next->wr.atomic.rkey = user_wr->wr.atomic.rkey; - case IB_WR_SEND: - break; - default: - ret = -EINVAL; - goto out_put; - } - } - if (next->num_sge) { next->sg_list = (void *) next + - ALIGN(sizeof *next, sizeof (struct ib_sge)); + ALIGN(next_size, sizeof(struct ib_sge)); if (copy_from_user(next->sg_list, buf + sizeof cmd + cmd.wr_count * cmd.wqe_size + @@ -2330,8 +2610,8 @@ out_put: put_qp_read(qp); while (wr) { - if (is_ud && wr->wr.ud.ah) - put_ah_read(wr->wr.ud.ah); + if (is_ud && ud_wr(wr)->ah) + put_ah_read(ud_wr(wr)->ah); next = wr->next; kfree(wr); wr = next; @@ -2429,6 +2709,7 @@ err: } ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2478,6 +2759,7 @@ out: } ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2527,6 +2809,7 @@ out: } ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2567,7 +2850,6 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, attr.grh.sgid_index = cmd.attr.grh.sgid_index; attr.grh.hop_limit = cmd.attr.grh.hop_limit; attr.grh.traffic_class = cmd.attr.grh.traffic_class; - attr.vlan_id = 0; memset(&attr.dmac, 0, sizeof(attr.dmac)); memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); @@ -2619,6 +2901,7 @@ err: } ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_ah cmd; @@ -2655,6 +2938,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, } ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2702,6 +2986,7 @@ out_put: } ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2782,6 +3067,7 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, } int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -2942,6 +3228,7 @@ err_free_attr: } int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -2984,6 +3271,7 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, } static int __uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_uverbs_create_xsrq *cmd, struct ib_udata *udata) { @@ -3117,6 +3405,7 @@ err: } ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -3144,7 +3433,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - ret = __uverbs_create_xsrq(file, &xcmd, &udata); + ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); if (ret) return ret; @@ -3152,6 +3441,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_xsrq cmd; @@ -3169,7 +3459,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - ret = __uverbs_create_xsrq(file, &cmd, &udata); + ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); if (ret) return ret; @@ -3177,6 +3467,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -3207,6 +3498,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -3247,6 +3539,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -3304,16 +3597,15 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, } int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_ex_query_device_resp resp; struct ib_uverbs_ex_query_device cmd; struct ib_device_attr attr; - struct ib_device *device; int err; - device = file->device->ib_dev; if (ucore->inlen < sizeof(cmd)) return -EINVAL; @@ -3332,11 +3624,13 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, if (ucore->outlen < resp.response_length) return -ENOSPC; - err = device->query_device(device, &attr); + memset(&attr, 0, sizeof(attr)); + + err = ib_dev->query_device(ib_dev, &attr, uhw); if (err) return err; - copy_query_dev_fields(file, &resp.base, &attr); + copy_query_dev_fields(file, ib_dev, &resp.base, &attr); resp.comp_mask = 0; if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps)) @@ -3356,6 +3650,18 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, #endif resp.response_length += sizeof(resp.odp_caps); + if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask)) + goto end; + + resp.timestamp_mask = attr.timestamp_mask; + resp.response_length += sizeof(resp.timestamp_mask); + + if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock)) + goto end; + + resp.hca_core_clock = attr.hca_core_clock; + resp.response_length += sizeof(resp.hca_core_clock); + end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); if (err) diff --git a/kernel/drivers/infiniband/core/uverbs_main.c b/kernel/drivers/infiniband/core/uverbs_main.c index 09686d49d..e3ef28861 100644 --- a/kernel/drivers/infiniband/core/uverbs_main.c +++ b/kernel/drivers/infiniband/core/uverbs_main.c @@ -79,6 +79,7 @@ static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) = { [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, @@ -119,21 +120,25 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, }; static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) = { [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device, + [IB_USER_VERBS_EX_CMD_CREATE_CQ] = ib_uverbs_ex_create_cq, + [IB_USER_VERBS_EX_CMD_CREATE_QP] = ib_uverbs_ex_create_qp, }; static void ib_uverbs_add_one(struct ib_device *device); -static void ib_uverbs_remove_one(struct ib_device *device); +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); static void ib_uverbs_release_dev(struct kobject *kobj) { struct ib_uverbs_device *dev = container_of(kobj, struct ib_uverbs_device, kobj); + cleanup_srcu_struct(&dev->disassociate_srcu); kfree(dev); } @@ -204,9 +209,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, { struct ib_uobject *uobj, *tmp; - if (!context) - return 0; - context->closing = 1; list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) { @@ -315,8 +317,16 @@ static void ib_uverbs_release_file(struct kref *ref) { struct ib_uverbs_file *file = container_of(ref, struct ib_uverbs_file, ref); + struct ib_device *ib_dev; + int srcu_key; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (ib_dev && !ib_dev->disassociate_ucontext) + module_put(ib_dev->owner); + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); - module_put(file->device->ib_dev->owner); if (atomic_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); @@ -340,9 +350,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf, return -EAGAIN; if (wait_event_interruptible(file->poll_wait, - !list_empty(&file->event_list))) + (!list_empty(&file->event_list) || + /* The barriers built into wait_event_interruptible() + * and wake_up() guarentee this will see the null set + * without using RCU + */ + !file->uverbs_file->device->ib_dev))) return -ERESTARTSYS; + /* If device was disassociated and no event exists set an error */ + if (list_empty(&file->event_list) && + !file->uverbs_file->device->ib_dev) + return -EIO; + spin_lock_irq(&file->lock); } @@ -405,8 +425,11 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp) { struct ib_uverbs_event_file *file = filp->private_data; struct ib_uverbs_event *entry, *tmp; + int closed_already = 0; + mutex_lock(&file->uverbs_file->device->lists_mutex); spin_lock_irq(&file->lock); + closed_already = file->is_closed; file->is_closed = 1; list_for_each_entry_safe(entry, tmp, &file->event_list, list) { if (entry->counter) @@ -414,11 +437,15 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp) kfree(entry); } spin_unlock_irq(&file->lock); - - if (file->is_async) { - ib_unregister_event_handler(&file->uverbs_file->event_handler); - kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); + if (!closed_already) { + list_del(&file->list); + if (file->is_async) + ib_unregister_event_handler(&file->uverbs_file-> + event_handler); } + mutex_unlock(&file->uverbs_file->device->lists_mutex); + + kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); kref_put(&file->ref, ib_uverbs_release_event_file); return 0; @@ -550,13 +577,21 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, NULL, NULL); } +void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file) +{ + kref_put(&file->async_file->ref, ib_uverbs_release_event_file); + file->async_file = NULL; +} + struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, + struct ib_device *ib_dev, int is_async) { struct ib_uverbs_event_file *ev_file; struct file *filp; + int ret; - ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL); + ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL); if (!ev_file) return ERR_PTR(-ENOMEM); @@ -565,16 +600,47 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, INIT_LIST_HEAD(&ev_file->event_list); init_waitqueue_head(&ev_file->poll_wait); ev_file->uverbs_file = uverbs_file; + kref_get(&ev_file->uverbs_file->ref); ev_file->async_queue = NULL; - ev_file->is_async = is_async; ev_file->is_closed = 0; filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops, ev_file, O_RDONLY); if (IS_ERR(filp)) - kfree(ev_file); + goto err_put_refs; + + mutex_lock(&uverbs_file->device->lists_mutex); + list_add_tail(&ev_file->list, + &uverbs_file->device->uverbs_events_file_list); + mutex_unlock(&uverbs_file->device->lists_mutex); + + if (is_async) { + WARN_ON(uverbs_file->async_file); + uverbs_file->async_file = ev_file; + kref_get(&uverbs_file->async_file->ref); + INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, + ib_dev, + ib_uverbs_event_handler); + ret = ib_register_event_handler(&uverbs_file->event_handler); + if (ret) + goto err_put_file; + + /* At that point async file stuff was fully set */ + ev_file->is_async = 1; + } return filp; + +err_put_file: + fput(filp); + kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file); + uverbs_file->async_file = NULL; + return ERR_PTR(ret); + +err_put_refs: + kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); + kref_put(&ev_file->ref, ib_uverbs_release_event_file); + return filp; } /* @@ -610,8 +676,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_file *file = filp->private_data; + struct ib_device *ib_dev; struct ib_uverbs_cmd_hdr hdr; __u32 flags; + int srcu_key; + ssize_t ret; if (count < sizeof hdr) return -EINVAL; @@ -619,6 +688,14 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof hdr)) return -EFAULT; + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto out; + } + flags = (hdr.command & IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; @@ -626,26 +703,36 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, __u32 command; if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) - return -EINVAL; + IB_USER_VERBS_CMD_COMMAND_MASK)) { + ret = -EINVAL; + goto out; + } command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; if (command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[command]) - return -EINVAL; + !uverbs_cmd_table[command]) { + ret = -EINVAL; + goto out; + } if (!file->ucontext && - command != IB_USER_VERBS_CMD_GET_CONTEXT) - return -EINVAL; + command != IB_USER_VERBS_CMD_GET_CONTEXT) { + ret = -EINVAL; + goto out; + } - if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command))) - return -ENOSYS; + if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) { + ret = -ENOSYS; + goto out; + } - if (hdr.in_words * 4 != count) - return -EINVAL; + if (hdr.in_words * 4 != count) { + ret = -EINVAL; + goto out; + } - return uverbs_cmd_table[command](file, + ret = uverbs_cmd_table[command](file, ib_dev, buf + sizeof(hdr), hdr.in_words * 4, hdr.out_words * 4); @@ -656,51 +743,72 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_udata ucore; struct ib_udata uhw; - int err; size_t written_count = count; if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) - return -EINVAL; + IB_USER_VERBS_CMD_COMMAND_MASK)) { + ret = -EINVAL; + goto out; + } command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || - !uverbs_ex_cmd_table[command]) - return -ENOSYS; + !uverbs_ex_cmd_table[command]) { + ret = -ENOSYS; + goto out; + } - if (!file->ucontext) - return -EINVAL; + if (!file->ucontext) { + ret = -EINVAL; + goto out; + } - if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command))) - return -ENOSYS; + if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) { + ret = -ENOSYS; + goto out; + } - if (count < (sizeof(hdr) + sizeof(ex_hdr))) - return -EINVAL; + if (count < (sizeof(hdr) + sizeof(ex_hdr))) { + ret = -EINVAL; + goto out; + } - if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) - return -EFAULT; + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) { + ret = -EFAULT; + goto out; + } count -= sizeof(hdr) + sizeof(ex_hdr); buf += sizeof(hdr) + sizeof(ex_hdr); - if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) - return -EINVAL; + if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) { + ret = -EINVAL; + goto out; + } - if (ex_hdr.cmd_hdr_reserved) - return -EINVAL; + if (ex_hdr.cmd_hdr_reserved) { + ret = -EINVAL; + goto out; + } if (ex_hdr.response) { - if (!hdr.out_words && !ex_hdr.provider_out_words) - return -EINVAL; + if (!hdr.out_words && !ex_hdr.provider_out_words) { + ret = -EINVAL; + goto out; + } if (!access_ok(VERIFY_WRITE, (void __user *) (unsigned long) ex_hdr.response, - (hdr.out_words + ex_hdr.provider_out_words) * 8)) - return -EFAULT; + (hdr.out_words + ex_hdr.provider_out_words) * 8)) { + ret = -EFAULT; + goto out; + } } else { - if (hdr.out_words || ex_hdr.provider_out_words) - return -EINVAL; + if (hdr.out_words || ex_hdr.provider_out_words) { + ret = -EINVAL; + goto out; + } } INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response, @@ -712,27 +820,43 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, ex_hdr.provider_in_words * 8, ex_hdr.provider_out_words * 8); - err = uverbs_ex_cmd_table[command](file, + ret = uverbs_ex_cmd_table[command](file, + ib_dev, &ucore, &uhw); - - if (err) - return err; - - return written_count; + if (!ret) + ret = written_count; + } else { + ret = -ENOSYS; } - return -ENOSYS; +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return ret; } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) { struct ib_uverbs_file *file = filp->private_data; + struct ib_device *ib_dev; + int ret = 0; + int srcu_key; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto out; + } if (!file->ucontext) - return -ENODEV; + ret = -ENODEV; else - return file->device->ib_dev->mmap(file->ucontext, vma); + ret = ib_dev->mmap(file->ucontext, vma); +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return ret; } /* @@ -749,21 +873,43 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) { struct ib_uverbs_device *dev; struct ib_uverbs_file *file; + struct ib_device *ib_dev; int ret; + int module_dependent; + int srcu_key; dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); if (!atomic_inc_not_zero(&dev->refcount)) return -ENXIO; - if (!try_module_get(dev->ib_dev->owner)) { - ret = -ENODEV; + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + mutex_lock(&dev->lists_mutex); + ib_dev = srcu_dereference(dev->ib_dev, + &dev->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; goto err; } - file = kmalloc(sizeof *file, GFP_KERNEL); + /* In case IB device supports disassociate ucontext, there is no hard + * dependency between uverbs device and its low level device. + */ + module_dependent = !(ib_dev->disassociate_ucontext); + + if (module_dependent) { + if (!try_module_get(ib_dev->owner)) { + ret = -ENODEV; + goto err; + } + } + + file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) { ret = -ENOMEM; - goto err_module; + if (module_dependent) + goto err_module; + + goto err; } file->device = dev; @@ -774,13 +920,18 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) filp->private_data = file; kobject_get(&dev->kobj); + list_add_tail(&file->list, &dev->uverbs_file_list); + mutex_unlock(&dev->lists_mutex); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return nonseekable_open(inode, filp); err_module: - module_put(dev->ib_dev->owner); + module_put(ib_dev->owner); err: + mutex_unlock(&dev->lists_mutex); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); if (atomic_dec_and_test(&dev->refcount)) ib_uverbs_comp_dev(dev); @@ -791,8 +942,18 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; struct ib_uverbs_device *dev = file->device; - - ib_uverbs_cleanup_ucontext(file, file->ucontext); + struct ib_ucontext *ucontext = NULL; + + mutex_lock(&file->device->lists_mutex); + ucontext = file->ucontext; + file->ucontext = NULL; + if (!file->is_closed) { + list_del(&file->list); + file->is_closed = 1; + } + mutex_unlock(&file->device->lists_mutex); + if (ucontext) + ib_uverbs_cleanup_ucontext(file, ucontext); if (file->async_file) kref_put(&file->async_file->ref, ib_uverbs_release_event_file); @@ -829,12 +990,21 @@ static struct ib_client uverbs_client = { static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, char *buf) { + int ret = -ENODEV; + int srcu_key; struct ib_uverbs_device *dev = dev_get_drvdata(device); + struct ib_device *ib_dev; if (!dev) return -ENODEV; - return sprintf(buf, "%s\n", dev->ib_dev->name); + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); + if (ib_dev) + ret = sprintf(buf, "%s\n", ib_dev->name); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + + return ret; } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); @@ -842,11 +1012,19 @@ static ssize_t show_dev_abi_version(struct device *device, struct device_attribute *attr, char *buf) { struct ib_uverbs_device *dev = dev_get_drvdata(device); + int ret = -ENODEV; + int srcu_key; + struct ib_device *ib_dev; if (!dev) return -ENODEV; + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); + if (ib_dev) + ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); - return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver); + return ret; } static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); @@ -886,6 +1064,7 @@ static void ib_uverbs_add_one(struct ib_device *device) int devnum; dev_t base; struct ib_uverbs_device *uverbs_dev; + int ret; if (!device->alloc_ucontext) return; @@ -894,11 +1073,20 @@ static void ib_uverbs_add_one(struct ib_device *device) if (!uverbs_dev) return; + ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); + if (ret) { + kfree(uverbs_dev); + return; + } + atomic_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); + mutex_init(&uverbs_dev->lists_mutex); + INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); + INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); @@ -919,7 +1107,7 @@ static void ib_uverbs_add_one(struct ib_device *device) } spin_unlock(&map_lock); - uverbs_dev->ib_dev = device; + rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; cdev_init(&uverbs_dev->cdev, NULL); @@ -963,9 +1151,72 @@ err: return; } -static void ib_uverbs_remove_one(struct ib_device *device) +static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, + struct ib_device *ib_dev) { - struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client); + struct ib_uverbs_file *file; + struct ib_uverbs_event_file *event_file; + struct ib_event event; + + /* Pending running commands to terminate */ + synchronize_srcu(&uverbs_dev->disassociate_srcu); + event.event = IB_EVENT_DEVICE_FATAL; + event.element.port_num = 0; + event.device = ib_dev; + + mutex_lock(&uverbs_dev->lists_mutex); + while (!list_empty(&uverbs_dev->uverbs_file_list)) { + struct ib_ucontext *ucontext; + + file = list_first_entry(&uverbs_dev->uverbs_file_list, + struct ib_uverbs_file, list); + file->is_closed = 1; + ucontext = file->ucontext; + list_del(&file->list); + file->ucontext = NULL; + kref_get(&file->ref); + mutex_unlock(&uverbs_dev->lists_mutex); + /* We must release the mutex before going ahead and calling + * disassociate_ucontext. disassociate_ucontext might end up + * indirectly calling uverbs_close, for example due to freeing + * the resources (e.g mmput). + */ + ib_uverbs_event_handler(&file->event_handler, &event); + if (ucontext) { + ib_dev->disassociate_ucontext(ucontext); + ib_uverbs_cleanup_ucontext(file, ucontext); + } + + mutex_lock(&uverbs_dev->lists_mutex); + kref_put(&file->ref, ib_uverbs_release_file); + } + + while (!list_empty(&uverbs_dev->uverbs_events_file_list)) { + event_file = list_first_entry(&uverbs_dev-> + uverbs_events_file_list, + struct ib_uverbs_event_file, + list); + spin_lock_irq(&event_file->lock); + event_file->is_closed = 1; + spin_unlock_irq(&event_file->lock); + + list_del(&event_file->list); + if (event_file->is_async) { + ib_unregister_event_handler(&event_file->uverbs_file-> + event_handler); + event_file->uverbs_file->event_handler.device = NULL; + } + + wake_up_interruptible(&event_file->poll_wait); + kill_fasync(&event_file->async_queue, SIGIO, POLL_IN); + } + mutex_unlock(&uverbs_dev->lists_mutex); +} + +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) +{ + struct ib_uverbs_device *uverbs_dev = client_data; + int wait_clients = 1; if (!uverbs_dev) return; @@ -979,9 +1230,27 @@ static void ib_uverbs_remove_one(struct ib_device *device) else clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); + if (device->disassociate_ucontext) { + /* We disassociate HW resources and immediately return. + * Userspace will see a EIO errno for all future access. + * Upon returning, ib_device may be freed internally and is not + * valid any more. + * uverbs_device is still available until all clients close + * their files, then the uverbs device ref count will be zero + * and its resources will be freed. + * Note: At this point no more files can be opened since the + * cdev was deleted, however active clients can still issue + * commands and close their open files. + */ + rcu_assign_pointer(uverbs_dev->ib_dev, NULL); + ib_uverbs_free_hw_resources(uverbs_dev, device); + wait_clients = 0; + } + if (atomic_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); - wait_for_completion(&uverbs_dev->comp); + if (wait_clients) + wait_for_completion(&uverbs_dev->comp); kobject_put(&uverbs_dev->kobj); } diff --git a/kernel/drivers/infiniband/core/uverbs_marshall.c b/kernel/drivers/infiniband/core/uverbs_marshall.c index abd972474..7d2f14c9b 100644 --- a/kernel/drivers/infiniband/core/uverbs_marshall.c +++ b/kernel/drivers/infiniband/core/uverbs_marshall.c @@ -141,8 +141,8 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, dst->preference = src->preference; dst->packet_life_time_selector = src->packet_life_time_selector; - memset(dst->smac, 0, sizeof(dst->smac)); memset(dst->dmac, 0, sizeof(dst->dmac)); - dst->vlan_id = 0xffff; + dst->net = NULL; + dst->ifindex = 0; } EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/kernel/drivers/infiniband/core/verbs.c b/kernel/drivers/infiniband/core/verbs.c index f93eb8da7..545906dec 100644 --- a/kernel/drivers/infiniband/core/verbs.c +++ b/kernel/drivers/infiniband/core/verbs.c @@ -41,6 +41,9 @@ #include <linux/export.h> #include <linux/string.h> #include <linux/slab.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <net/addrconf.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> @@ -48,6 +51,71 @@ #include "core_priv.h" +static const char * const ib_events[] = { + [IB_EVENT_CQ_ERR] = "CQ error", + [IB_EVENT_QP_FATAL] = "QP fatal error", + [IB_EVENT_QP_REQ_ERR] = "QP request error", + [IB_EVENT_QP_ACCESS_ERR] = "QP access error", + [IB_EVENT_COMM_EST] = "communication established", + [IB_EVENT_SQ_DRAINED] = "send queue drained", + [IB_EVENT_PATH_MIG] = "path migration successful", + [IB_EVENT_PATH_MIG_ERR] = "path migration error", + [IB_EVENT_DEVICE_FATAL] = "device fatal error", + [IB_EVENT_PORT_ACTIVE] = "port active", + [IB_EVENT_PORT_ERR] = "port error", + [IB_EVENT_LID_CHANGE] = "LID change", + [IB_EVENT_PKEY_CHANGE] = "P_key change", + [IB_EVENT_SM_CHANGE] = "SM change", + [IB_EVENT_SRQ_ERR] = "SRQ error", + [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", + [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", + [IB_EVENT_CLIENT_REREGISTER] = "client reregister", + [IB_EVENT_GID_CHANGE] = "GID changed", +}; + +const char *__attribute_const__ ib_event_msg(enum ib_event_type event) +{ + size_t index = event; + + return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ? + ib_events[index] : "unrecognized event"; +} +EXPORT_SYMBOL(ib_event_msg); + +static const char * const wc_statuses[] = { + [IB_WC_SUCCESS] = "success", + [IB_WC_LOC_LEN_ERR] = "local length error", + [IB_WC_LOC_QP_OP_ERR] = "local QP operation error", + [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error", + [IB_WC_LOC_PROT_ERR] = "local protection error", + [IB_WC_WR_FLUSH_ERR] = "WR flushed", + [IB_WC_MW_BIND_ERR] = "memory management operation error", + [IB_WC_BAD_RESP_ERR] = "bad response error", + [IB_WC_LOC_ACCESS_ERR] = "local access error", + [IB_WC_REM_INV_REQ_ERR] = "invalid request error", + [IB_WC_REM_ACCESS_ERR] = "remote access error", + [IB_WC_REM_OP_ERR] = "remote operation error", + [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", + [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded", + [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error", + [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request", + [IB_WC_REM_ABORT_ERR] = "operation aborted", + [IB_WC_INV_EECN_ERR] = "invalid EE context number", + [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state", + [IB_WC_FATAL_ERR] = "fatal error", + [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error", + [IB_WC_GENERAL_ERR] = "general error", +}; + +const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status) +{ + size_t index = status; + + return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ? + wc_statuses[index] : "unrecognized status"; +} +EXPORT_SYMBOL(ib_wc_status_msg); + __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { @@ -148,28 +216,79 @@ EXPORT_SYMBOL(rdma_port_get_link_layer); /* Protection domains */ +/** + * ib_alloc_pd - Allocates an unused protection domain. + * @device: The device on which to allocate the protection domain. + * + * A protection domain object provides an association between QPs, shared + * receive queues, address handles, memory regions, and memory windows. + * + * Every PD has a local_dma_lkey which can be used as the lkey value for local + * memory operations. + */ struct ib_pd *ib_alloc_pd(struct ib_device *device) { struct ib_pd *pd; + struct ib_device_attr devattr; + int rc; + + rc = ib_query_device(device, &devattr); + if (rc) + return ERR_PTR(rc); pd = device->alloc_pd(device, NULL, NULL); + if (IS_ERR(pd)) + return pd; + + pd->device = device; + pd->uobject = NULL; + pd->local_mr = NULL; + atomic_set(&pd->usecnt, 0); + + if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + pd->local_dma_lkey = device->local_dma_lkey; + else { + struct ib_mr *mr; + + mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(mr)) { + ib_dealloc_pd(pd); + return (struct ib_pd *)mr; + } - if (!IS_ERR(pd)) { - pd->device = device; - pd->uobject = NULL; - atomic_set(&pd->usecnt, 0); + pd->local_mr = mr; + pd->local_dma_lkey = pd->local_mr->lkey; } - return pd; } EXPORT_SYMBOL(ib_alloc_pd); -int ib_dealloc_pd(struct ib_pd *pd) +/** + * ib_dealloc_pd - Deallocates a protection domain. + * @pd: The protection domain to deallocate. + * + * It is an error to call this function while any resources in the pd still + * exist. The caller is responsible to synchronously destroy them and + * guarantee no new allocations will happen. + */ +void ib_dealloc_pd(struct ib_pd *pd) { - if (atomic_read(&pd->usecnt)) - return -EBUSY; + int ret; - return pd->device->dealloc_pd(pd); + if (pd->local_mr) { + ret = ib_dereg_mr(pd->local_mr); + WARN_ON(ret); + pd->local_mr = NULL; + } + + /* uverbs manipulates usecnt with proper locking, while the kabi + requires the caller to guarantee we can't race here. */ + WARN_ON(atomic_read(&pd->usecnt)); + + /* Making delalloc_pd a void return is a WIP, no driver should return + an error here. */ + ret = pd->device->dealloc_pd(pd); + WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); } EXPORT_SYMBOL(ib_dealloc_pd); @@ -192,32 +311,69 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) } EXPORT_SYMBOL(ib_create_ah); -int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, - struct ib_grh *grh, struct ib_ah_attr *ah_attr) +struct find_gid_index_context { + u16 vlan_id; +}; + +static bool find_gid_index(const union ib_gid *gid, + const struct ib_gid_attr *gid_attr, + void *context) +{ + struct find_gid_index_context *ctx = + (struct find_gid_index_context *)context; + + if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) || + (is_vlan_dev(gid_attr->ndev) && + vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id)) + return false; + + return true; +} + +static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num, + u16 vlan_id, const union ib_gid *sgid, + u16 *gid_index) +{ + struct find_gid_index_context context = {.vlan_id = vlan_id}; + + return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index, + &context, gid_index); +} + +int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct ib_ah_attr *ah_attr) { u32 flow_class; u16 gid_index; int ret; - int is_eth = (rdma_port_get_link_layer(device, port_num) == - IB_LINK_LAYER_ETHERNET); memset(ah_attr, 0, sizeof *ah_attr); - if (is_eth) { + if (rdma_cap_eth_ah(device, port_num)) { + u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? + wc->vlan_id : 0xffff; + if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; - if (wc->wc_flags & IB_WC_WITH_SMAC && - wc->wc_flags & IB_WC_WITH_VLAN) { - memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); - ah_attr->vlan_id = wc->vlan_id; - } else { + if (!(wc->wc_flags & IB_WC_WITH_SMAC) || + !(wc->wc_flags & IB_WC_WITH_VLAN)) { ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, - ah_attr->dmac, &ah_attr->vlan_id); + ah_attr->dmac, + wc->wc_flags & IB_WC_WITH_VLAN ? + NULL : &vlan_id, + 0); if (ret) return ret; } - } else { - ah_attr->vlan_id = 0xffff; + + ret = get_sgid_index_from_eth(device, port_num, vlan_id, + &grh->dgid, &gid_index); + if (ret) + return ret; + + if (wc->wc_flags & IB_WC_WITH_SMAC) + memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); } ah_attr->dlid = wc->slid; @@ -229,10 +385,13 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = grh->sgid; - ret = ib_find_cached_gid(device, &grh->dgid, &port_num, - &gid_index); - if (ret) - return ret; + if (!rdma_cap_eth_ah(device, port_num)) { + ret = ib_find_cached_gid_by_port(device, &grh->dgid, + port_num, NULL, + &gid_index); + if (ret) + return ret; + } ah_attr->grh.sgid_index = (u8) gid_index; flow_class = be32_to_cpu(grh->version_tclass_flow); @@ -244,8 +403,8 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, } EXPORT_SYMBOL(ib_init_ah_from_wc); -struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, - struct ib_grh *grh, u8 port_num) +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, + const struct ib_grh *grh, u8 port_num) { struct ib_ah_attr ah_attr; int ret; @@ -502,9 +661,7 @@ EXPORT_SYMBOL(ib_create_qp); static const struct { int valid; enum ib_qp_attr_mask req_param[IB_QPT_MAX]; - enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX]; enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; - enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, @@ -585,12 +742,6 @@ static const struct { IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), }, - .req_param_add_eth = { - [IB_QPT_RC] = (IB_QP_SMAC), - [IB_QPT_UC] = (IB_QP_SMAC), - [IB_QPT_XRC_INI] = (IB_QP_SMAC), - [IB_QPT_XRC_TGT] = (IB_QP_SMAC) - }, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), @@ -611,21 +762,7 @@ static const struct { [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), }, - .opt_param_add_eth = { - [IB_QPT_RC] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_UC] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID) - } - } + }, }, [IB_QPS_RTR] = { [IB_QPS_RESET] = { .valid = 1 }, @@ -847,13 +984,6 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, req_param = qp_state_table[cur_state][next_state].req_param[type]; opt_param = qp_state_table[cur_state][next_state].opt_param[type]; - if (ll == IB_LINK_LAYER_ETHERNET) { - req_param |= qp_state_table[cur_state][next_state]. - req_param_add_eth[type]; - opt_param |= qp_state_table[cur_state][next_state]. - opt_param_add_eth[type]; - } - if ((mask & req_param) != req_param) return 0; @@ -864,40 +994,52 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, } EXPORT_SYMBOL(ib_modify_qp_is_ok); -int ib_resolve_eth_l2_attrs(struct ib_qp *qp, - struct ib_qp_attr *qp_attr, int *qp_attr_mask) +int ib_resolve_eth_dmac(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) { int ret = 0; - union ib_gid sgid; - if ((*qp_attr_mask & IB_QP_AV) && - (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) { - ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num, - qp_attr->ah_attr.grh.sgid_index, &sgid); - if (ret) - goto out; + if (*qp_attr_mask & IB_QP_AV) { + if (qp_attr->ah_attr.port_num < rdma_start_port(qp->device) || + qp_attr->ah_attr.port_num > rdma_end_port(qp->device)) + return -EINVAL; + + if (!rdma_cap_eth_ah(qp->device, qp_attr->ah_attr.port_num)) + return 0; + if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { - rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac); - rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac); - if (!(*qp_attr_mask & IB_QP_VID)) - qp_attr->vlan_id = rdma_get_vlan_id(&sgid); + rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, + qp_attr->ah_attr.dmac); } else { - ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid, - qp_attr->ah_attr.dmac, &qp_attr->vlan_id); - if (ret) - goto out; - ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL); - if (ret) + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + int ifindex; + + ret = ib_query_gid(qp->device, + qp_attr->ah_attr.port_num, + qp_attr->ah_attr.grh.sgid_index, + &sgid, &sgid_attr); + + if (ret || !sgid_attr.ndev) { + if (!ret) + ret = -ENXIO; goto out; + } + + ifindex = sgid_attr.ndev->ifindex; + + ret = rdma_addr_find_dmac_by_grh(&sgid, + &qp_attr->ah_attr.grh.dgid, + qp_attr->ah_attr.dmac, + NULL, ifindex); + + dev_put(sgid_attr.ndev); } - *qp_attr_mask |= IB_QP_SMAC; - if (qp_attr->vlan_id < 0xFFFF) - *qp_attr_mask |= IB_QP_VID; } out: return ret; } -EXPORT_SYMBOL(ib_resolve_eth_l2_attrs); +EXPORT_SYMBOL(ib_resolve_eth_dmac); int ib_modify_qp(struct ib_qp *qp, @@ -906,7 +1048,7 @@ int ib_modify_qp(struct ib_qp *qp, { int ret; - ret = ib_resolve_eth_l2_attrs(qp, qp_attr, &qp_attr_mask); + ret = ib_resolve_eth_dmac(qp, qp_attr, &qp_attr_mask); if (ret) return ret; @@ -1012,11 +1154,12 @@ EXPORT_SYMBOL(ib_destroy_qp); struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), - void *cq_context, int cqe, int comp_vector) + void *cq_context, + const struct ib_cq_init_attr *cq_attr) { struct ib_cq *cq; - cq = device->create_cq(device, cqe, comp_vector, NULL, NULL); + cq = device->create_cq(device, cq_attr, NULL, NULL); if (!IS_ERR(cq)) { cq->device = device; @@ -1079,73 +1222,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) } EXPORT_SYMBOL(ib_get_dma_mr); -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - struct ib_mr *mr; - int err; - - err = ib_check_mr_access(mr_access_flags); - if (err) - return ERR_PTR(err); - - if (!pd->device->reg_phys_mr) - return ERR_PTR(-ENOSYS); - - mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf, - mr_access_flags, iova_start); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_reg_phys_mr); - -int ib_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - struct ib_pd *old_pd; - int ret; - - ret = ib_check_mr_access(mr_access_flags); - if (ret) - return ret; - - if (!mr->device->rereg_phys_mr) - return -ENOSYS; - - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - old_pd = mr->pd; - - ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd, - phys_buf_array, num_phys_buf, - mr_access_flags, iova_start); - - if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) { - atomic_dec(&old_pd->usecnt); - atomic_inc(&pd->usecnt); - } - - return ret; -} -EXPORT_SYMBOL(ib_rereg_phys_mr); - int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) { return mr->device->query_mr ? @@ -1170,54 +1246,28 @@ int ib_dereg_mr(struct ib_mr *mr) } EXPORT_SYMBOL(ib_dereg_mr); -struct ib_mr *ib_create_mr(struct ib_pd *pd, - struct ib_mr_init_attr *mr_init_attr) -{ - struct ib_mr *mr; - - if (!pd->device->create_mr) - return ERR_PTR(-ENOSYS); - - mr = pd->device->create_mr(pd, mr_init_attr); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_create_mr); - -int ib_destroy_mr(struct ib_mr *mr) -{ - struct ib_pd *pd; - int ret; - - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - pd = mr->pd; - ret = mr->device->destroy_mr(mr); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_destroy_mr); - -struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) +/** + * ib_alloc_mr() - Allocates a memory region + * @pd: protection domain associated with the region + * @mr_type: memory region type + * @max_num_sg: maximum sg entries available for registration. + * + * Notes: + * Memory registeration page/sg lists must not exceed max_num_sg. + * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed + * max_num_sg * used_page_size. + * + */ +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) { struct ib_mr *mr; - if (!pd->device->alloc_fast_reg_mr) + if (!pd->device->alloc_mr) return ERR_PTR(-ENOSYS); - mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len); - + mr = pd->device->alloc_mr(pd, mr_type, max_num_sg); if (!IS_ERR(mr)) { mr->device = pd->device; mr->pd = pd; @@ -1228,32 +1278,7 @@ struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) return mr; } -EXPORT_SYMBOL(ib_alloc_fast_reg_mr); - -struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device, - int max_page_list_len) -{ - struct ib_fast_reg_page_list *page_list; - - if (!device->alloc_fast_reg_page_list) - return ERR_PTR(-ENOSYS); - - page_list = device->alloc_fast_reg_page_list(device, max_page_list_len); - - if (!IS_ERR(page_list)) { - page_list->device = device; - page_list->max_page_list_len = max_page_list_len; - } - - return page_list; -} -EXPORT_SYMBOL(ib_alloc_fast_reg_page_list); - -void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) -{ - page_list->device->free_fast_reg_page_list(page_list); -} -EXPORT_SYMBOL(ib_free_fast_reg_page_list); +EXPORT_SYMBOL(ib_alloc_mr); /* Memory windows */ @@ -1446,3 +1471,111 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; } EXPORT_SYMBOL(ib_check_mr_status); + +/** + * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list + * and set it the memory region. + * @mr: memory region + * @sg: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @page_size: page vector desired page size + * + * Constraints: + * - The first sg element is allowed to have an offset. + * - Each sg element must be aligned to page_size (or physically + * contiguous to the previous element). In case an sg element has a + * non contiguous offset, the mapping prefix will not include it. + * - The last sg element is allowed to have length less than page_size. + * - If sg_nents total byte length exceeds the mr max_num_sge * page_size + * then only max_num_sg entries will be mapped. + * + * Returns the number of sg elements that were mapped to the memory region. + * + * After this completes successfully, the memory region + * is ready for registration. + */ +int ib_map_mr_sg(struct ib_mr *mr, + struct scatterlist *sg, + int sg_nents, + unsigned int page_size) +{ + if (unlikely(!mr->device->map_mr_sg)) + return -ENOSYS; + + mr->page_size = page_size; + + return mr->device->map_mr_sg(mr, sg, sg_nents); +} +EXPORT_SYMBOL(ib_map_mr_sg); + +/** + * ib_sg_to_pages() - Convert the largest prefix of a sg list + * to a page vector + * @mr: memory region + * @sgl: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @set_page: driver page assignment function pointer + * + * Core service helper for drivers to convert the largest + * prefix of given sg list to a page vector. The sg list + * prefix converted is the prefix that meet the requirements + * of ib_map_mr_sg. + * + * Returns the number of sg elements that were assigned to + * a page vector. + */ +int ib_sg_to_pages(struct ib_mr *mr, + struct scatterlist *sgl, + int sg_nents, + int (*set_page)(struct ib_mr *, u64)) +{ + struct scatterlist *sg; + u64 last_end_dma_addr = 0, last_page_addr = 0; + unsigned int last_page_off = 0; + u64 page_mask = ~((u64)mr->page_size - 1); + int i, ret; + + mr->iova = sg_dma_address(&sgl[0]); + mr->length = 0; + + for_each_sg(sgl, sg, sg_nents, i) { + u64 dma_addr = sg_dma_address(sg); + unsigned int dma_len = sg_dma_len(sg); + u64 end_dma_addr = dma_addr + dma_len; + u64 page_addr = dma_addr & page_mask; + + /* + * For the second and later elements, check whether either the + * end of element i-1 or the start of element i is not aligned + * on a page boundary. + */ + if (i && (last_page_off != 0 || page_addr != dma_addr)) { + /* Stop mapping if there is a gap. */ + if (last_end_dma_addr != dma_addr) + break; + + /* + * Coalesce this element with the last. If it is small + * enough just update mr->length. Otherwise start + * mapping from the next page. + */ + goto next_page; + } + + do { + ret = set_page(mr, page_addr); + if (unlikely(ret < 0)) + return i ? : ret; +next_page: + page_addr += mr->page_size; + } while (page_addr < end_dma_addr); + + mr->length += dma_len; + last_end_dma_addr = end_dma_addr; + last_page_addr = end_dma_addr & page_mask; + last_page_off = end_dma_addr & ~page_mask; + } + + return i; +} +EXPORT_SYMBOL(ib_sg_to_pages); |