summaryrefslogtreecommitdiffstats
path: root/kernel/drivers/infiniband/ulp/ipoib
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/drivers/infiniband/ulp/ipoib')
-rw-r--r--kernel/drivers/infiniband/ulp/ipoib/ipoib.h37
-rw-r--r--kernel/drivers/infiniband/ulp/ipoib/ipoib_cm.c41
-rw-r--r--kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c71
-rw-r--r--kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c300
-rw-r--r--kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c62
-rw-r--r--kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c36
6 files changed, 408 insertions, 139 deletions
diff --git a/kernel/drivers/infiniband/ulp/ipoib/ipoib.h b/kernel/drivers/infiniband/ulp/ipoib/ipoib.h
index bd94b0a6e..3ede10309 100644
--- a/kernel/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/kernel/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -80,7 +80,7 @@ enum {
IPOIB_NUM_WC = 4,
IPOIB_MAX_PATH_REC_QUEUE = 3,
- IPOIB_MAX_MCAST_QUEUE = 3,
+ IPOIB_MAX_MCAST_QUEUE = 64,
IPOIB_FLAG_OPER_UP = 0,
IPOIB_FLAG_INITIALIZED = 1,
@@ -239,7 +239,7 @@ struct ipoib_cm_tx {
struct net_device *dev;
struct ipoib_neigh *neigh;
struct ipoib_path *path;
- struct ipoib_cm_tx_buf *tx_ring;
+ struct ipoib_tx_buf *tx_ring;
unsigned tx_head;
unsigned tx_tail;
unsigned long flags;
@@ -342,7 +342,6 @@ struct ipoib_dev_priv {
u16 pkey;
u16 pkey_index;
struct ib_pd *pd;
- struct ib_mr *mr;
struct ib_cq *recv_cq;
struct ib_cq *send_cq;
struct ib_qp *qp;
@@ -361,7 +360,7 @@ struct ipoib_dev_priv {
unsigned tx_head;
unsigned tx_tail;
struct ib_sge tx_sge[MAX_SKB_FRAGS + 1];
- struct ib_send_wr tx_wr;
+ struct ib_ud_wr tx_wr;
unsigned tx_outstanding;
struct ib_wc send_wc[MAX_SEND_CQE];
@@ -496,6 +495,7 @@ void ipoib_dev_cleanup(struct net_device *dev);
void ipoib_mcast_join_task(struct work_struct *work);
void ipoib_mcast_carrier_on_task(struct work_struct *work);
void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
+void ipoib_mcast_free(struct ipoib_mcast *mc);
void ipoib_mcast_restart_task(struct work_struct *work);
int ipoib_mcast_start_thread(struct net_device *dev);
@@ -504,6 +504,33 @@ int ipoib_mcast_stop_thread(struct net_device *dev);
void ipoib_mcast_dev_down(struct net_device *dev);
void ipoib_mcast_dev_flush(struct net_device *dev);
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
+void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
+ struct ipoib_tx_buf *tx_req);
+
+static inline void ipoib_build_sge(struct ipoib_dev_priv *priv,
+ struct ipoib_tx_buf *tx_req)
+{
+ int i, off;
+ struct sk_buff *skb = tx_req->skb;
+ skb_frag_t *frags = skb_shinfo(skb)->frags;
+ int nr_frags = skb_shinfo(skb)->nr_frags;
+ u64 *mapping = tx_req->mapping;
+
+ if (skb_headlen(skb)) {
+ priv->tx_sge[0].addr = mapping[0];
+ priv->tx_sge[0].length = skb_headlen(skb);
+ off = 1;
+ } else
+ off = 0;
+
+ for (i = 0; i < nr_frags; ++i) {
+ priv->tx_sge[i + off].addr = mapping[i + off];
+ priv->tx_sge[i + off].length = skb_frag_size(&frags[i]);
+ }
+ priv->tx_wr.wr.num_sge = nr_frags + off;
+}
+
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
@@ -522,6 +549,8 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
union ib_gid *mgid, int set_qkey);
+int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast);
+struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid);
int ipoib_init_qp(struct net_device *dev);
int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
diff --git a/kernel/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/kernel/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index cf32a778e..3ae9726ef 100644
--- a/kernel/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/kernel/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -332,7 +332,7 @@ static void ipoib_cm_init_rx_wr(struct net_device *dev,
int i;
for (i = 0; i < priv->cm.num_frags; ++i)
- sge[i].lkey = priv->mr->lkey;
+ sge[i].lkey = priv->pd->local_dma_lkey;
sge[0].length = IPOIB_CM_HEAD_SIZE;
for (i = 1; i < priv->cm.num_frags; ++i)
@@ -694,24 +694,21 @@ repost:
static inline int post_send(struct ipoib_dev_priv *priv,
struct ipoib_cm_tx *tx,
unsigned int wr_id,
- u64 addr, int len)
+ struct ipoib_tx_buf *tx_req)
{
struct ib_send_wr *bad_wr;
- priv->tx_sge[0].addr = addr;
- priv->tx_sge[0].length = len;
+ ipoib_build_sge(priv, tx_req);
- priv->tx_wr.num_sge = 1;
- priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
+ priv->tx_wr.wr.wr_id = wr_id | IPOIB_OP_CM;
- return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
+ return ib_post_send(tx->qp, &priv->tx_wr.wr, &bad_wr);
}
void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- struct ipoib_cm_tx_buf *tx_req;
- u64 addr;
+ struct ipoib_tx_buf *tx_req;
int rc;
if (unlikely(skb->len > tx->mtu)) {
@@ -735,24 +732,21 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
*/
tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
tx_req->skb = skb;
- addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
- if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+
+ if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
++dev->stats.tx_errors;
dev_kfree_skb_any(skb);
return;
}
- tx_req->mapping = addr;
-
skb_orphan(skb);
skb_dst_drop(skb);
- rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
- addr, skb->len);
+ rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req);
if (unlikely(rc)) {
ipoib_warn(priv, "post_send failed, error %d\n", rc);
++dev->stats.tx_errors;
- ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
+ ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(skb);
} else {
dev->trans_start = jiffies;
@@ -777,7 +771,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_cm_tx *tx = wc->qp->qp_context;
unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
- struct ipoib_cm_tx_buf *tx_req;
+ struct ipoib_tx_buf *tx_req;
unsigned long flags;
ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
@@ -791,7 +785,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
tx_req = &tx->tx_ring[wr_id];
- ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
+ ipoib_dma_unmap_tx(priv, tx_req);
/* FIXME: is this right? Shouldn't we only increment on success? */
++dev->stats.tx_packets;
@@ -854,7 +848,7 @@ int ipoib_cm_dev_open(struct net_device *dev)
}
ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
- 0, NULL);
+ 0);
if (ret) {
printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
IPOIB_CM_IETF_ID | priv->qp->qp_num);
@@ -1036,6 +1030,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
struct ib_qp *tx_qp;
+ if (dev->features & NETIF_F_SG)
+ attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
+
tx_qp = ib_create_qp(priv->pd, &attr);
if (PTR_ERR(tx_qp) == -EINVAL) {
ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
@@ -1170,7 +1167,7 @@ err_tx:
static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
{
struct ipoib_dev_priv *priv = netdev_priv(p->dev);
- struct ipoib_cm_tx_buf *tx_req;
+ struct ipoib_tx_buf *tx_req;
unsigned long begin;
ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
@@ -1197,8 +1194,7 @@ timeout:
while ((int) p->tx_tail - (int) p->tx_head < 0) {
tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
- ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
- DMA_TO_DEVICE);
+ ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(tx_req->skb);
++p->tx_tail;
netif_tx_lock_bh(p->dev);
@@ -1455,7 +1451,6 @@ static void ipoib_cm_stale_task(struct work_struct *work)
spin_unlock_irq(&priv->lock);
}
-
static ssize_t show_mode(struct device *d, struct device_attribute *attr,
char *buf)
{
diff --git a/kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 63b92cbb2..5ea0c1407 100644
--- a/kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -263,8 +263,7 @@ repost:
"for buf %d\n", wr_id);
}
-static int ipoib_dma_map_tx(struct ib_device *ca,
- struct ipoib_tx_buf *tx_req)
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
{
struct sk_buff *skb = tx_req->skb;
u64 *mapping = tx_req->mapping;
@@ -305,8 +304,8 @@ partial_error:
return -EIO;
}
-static void ipoib_dma_unmap_tx(struct ib_device *ca,
- struct ipoib_tx_buf *tx_req)
+void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
+ struct ipoib_tx_buf *tx_req)
{
struct sk_buff *skb = tx_req->skb;
u64 *mapping = tx_req->mapping;
@@ -314,7 +313,8 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
int off;
if (skb_headlen(skb)) {
- ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+ ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb),
+ DMA_TO_DEVICE);
off = 1;
} else
off = 0;
@@ -322,8 +322,8 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag),
- DMA_TO_DEVICE);
+ ib_dma_unmap_page(priv->ca, mapping[i + off],
+ skb_frag_size(frag), DMA_TO_DEVICE);
}
}
@@ -389,7 +389,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
tx_req = &priv->tx_ring[wr_id];
- ipoib_dma_unmap_tx(priv->ca, tx_req);
+ ipoib_dma_unmap_tx(priv, tx_req);
++dev->stats.tx_packets;
dev->stats.tx_bytes += tx_req->skb->len;
@@ -514,37 +514,23 @@ static inline int post_send(struct ipoib_dev_priv *priv,
void *head, int hlen)
{
struct ib_send_wr *bad_wr;
- int i, off;
struct sk_buff *skb = tx_req->skb;
- skb_frag_t *frags = skb_shinfo(skb)->frags;
- int nr_frags = skb_shinfo(skb)->nr_frags;
- u64 *mapping = tx_req->mapping;
- if (skb_headlen(skb)) {
- priv->tx_sge[0].addr = mapping[0];
- priv->tx_sge[0].length = skb_headlen(skb);
- off = 1;
- } else
- off = 0;
+ ipoib_build_sge(priv, tx_req);
- for (i = 0; i < nr_frags; ++i) {
- priv->tx_sge[i + off].addr = mapping[i + off];
- priv->tx_sge[i + off].length = skb_frag_size(&frags[i]);
- }
- priv->tx_wr.num_sge = nr_frags + off;
- priv->tx_wr.wr_id = wr_id;
- priv->tx_wr.wr.ud.remote_qpn = qpn;
- priv->tx_wr.wr.ud.ah = address;
+ priv->tx_wr.wr.wr_id = wr_id;
+ priv->tx_wr.remote_qpn = qpn;
+ priv->tx_wr.ah = address;
if (head) {
- priv->tx_wr.wr.ud.mss = skb_shinfo(skb)->gso_size;
- priv->tx_wr.wr.ud.header = head;
- priv->tx_wr.wr.ud.hlen = hlen;
- priv->tx_wr.opcode = IB_WR_LSO;
+ priv->tx_wr.mss = skb_shinfo(skb)->gso_size;
+ priv->tx_wr.header = head;
+ priv->tx_wr.hlen = hlen;
+ priv->tx_wr.wr.opcode = IB_WR_LSO;
} else
- priv->tx_wr.opcode = IB_WR_SEND;
+ priv->tx_wr.wr.opcode = IB_WR_SEND;
- return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+ return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr);
}
void ipoib_send(struct net_device *dev, struct sk_buff *skb,
@@ -597,9 +583,9 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
}
if (skb->ip_summed == CHECKSUM_PARTIAL)
- priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
+ priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
else
- priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+ priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
if (++priv->tx_outstanding == ipoib_sendq_size) {
ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
@@ -617,7 +603,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
ipoib_warn(priv, "post_send failed, error %d\n", rc);
++dev->stats.tx_errors;
--priv->tx_outstanding;
- ipoib_dma_unmap_tx(priv->ca, tx_req);
+ ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(skb);
if (netif_queue_stopped(dev))
netif_wake_queue(dev);
@@ -868,7 +854,7 @@ int ipoib_ib_dev_stop(struct net_device *dev)
while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
tx_req = &priv->tx_ring[priv->tx_tail &
(ipoib_sendq_size - 1)];
- ipoib_dma_unmap_tx(priv->ca, tx_req);
+ ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(tx_req->skb);
++priv->tx_tail;
--priv->tx_outstanding;
@@ -985,20 +971,21 @@ static inline int update_child_pkey(struct ipoib_dev_priv *priv)
}
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
- enum ipoib_flush_level level)
+ enum ipoib_flush_level level,
+ int nesting)
{
struct ipoib_dev_priv *cpriv;
struct net_device *dev = priv->dev;
int result;
- down_read(&priv->vlan_rwsem);
+ down_read_nested(&priv->vlan_rwsem, nesting);
/*
* Flush any child interfaces too -- they might be up even if
* the parent is down.
*/
list_for_each_entry(cpriv, &priv->child_intfs, list)
- __ipoib_ib_dev_flush(cpriv, level);
+ __ipoib_ib_dev_flush(cpriv, level, nesting + 1);
up_read(&priv->vlan_rwsem);
@@ -1076,7 +1063,7 @@ void ipoib_ib_dev_flush_light(struct work_struct *work)
struct ipoib_dev_priv *priv =
container_of(work, struct ipoib_dev_priv, flush_light);
- __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0);
}
void ipoib_ib_dev_flush_normal(struct work_struct *work)
@@ -1084,7 +1071,7 @@ void ipoib_ib_dev_flush_normal(struct work_struct *work)
struct ipoib_dev_priv *priv =
container_of(work, struct ipoib_dev_priv, flush_normal);
- __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0);
}
void ipoib_ib_dev_flush_heavy(struct work_struct *work)
@@ -1092,7 +1079,7 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work)
struct ipoib_dev_priv *priv =
container_of(work, struct ipoib_dev_priv, flush_heavy);
- __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0);
}
void ipoib_ib_dev_cleanup(struct net_device *dev)
diff --git a/kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c b/kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 9e1b203d7..7d3281866 100644
--- a/kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -48,6 +48,9 @@
#include <linux/jhash.h>
#include <net/arp.h>
+#include <net/addrconf.h>
+#include <linux/inetdevice.h>
+#include <rdma/ib_cache.h>
#define DRV_VERSION "1.0.0"
@@ -89,13 +92,18 @@ struct workqueue_struct *ipoib_workqueue;
struct ib_sa_client ipoib_sa_client;
static void ipoib_add_one(struct ib_device *device);
-static void ipoib_remove_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device, void *client_data);
static void ipoib_neigh_reclaim(struct rcu_head *rp);
+static struct net_device *ipoib_get_net_dev_by_params(
+ struct ib_device *dev, u8 port, u16 pkey,
+ const union ib_gid *gid, const struct sockaddr *addr,
+ void *client_data);
static struct ib_client ipoib_client = {
.name = "ipoib",
.add = ipoib_add_one,
- .remove = ipoib_remove_one
+ .remove = ipoib_remove_one,
+ .get_net_dev_by_params = ipoib_get_net_dev_by_params,
};
int ipoib_open(struct net_device *dev)
@@ -190,7 +198,7 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu
struct ipoib_dev_priv *priv = netdev_priv(dev);
if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
- features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
+ features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
return features;
}
@@ -222,6 +230,225 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
return 0;
}
+/* Called with an RCU read lock taken */
+static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
+ struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct in_device *in_dev;
+ struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
+ struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
+ __be32 ret_addr;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ in_dev = in_dev_get(dev);
+ if (!in_dev)
+ return false;
+
+ ret_addr = inet_confirm_addr(net, in_dev, 0,
+ addr_in->sin_addr.s_addr,
+ RT_SCOPE_HOST);
+ in_dev_put(in_dev);
+ if (ret_addr)
+ return true;
+
+ break;
+ case AF_INET6:
+ if (IS_ENABLED(CONFIG_IPV6) &&
+ ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
+ return true;
+
+ break;
+ }
+ return false;
+}
+
+/**
+ * Find the master net_device on top of the given net_device.
+ * @dev: base IPoIB net_device
+ *
+ * Returns the master net_device with a reference held, or the same net_device
+ * if no master exists.
+ */
+static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
+{
+ struct net_device *master;
+
+ rcu_read_lock();
+ master = netdev_master_upper_dev_get_rcu(dev);
+ if (master)
+ dev_hold(master);
+ rcu_read_unlock();
+
+ if (master)
+ return master;
+
+ dev_hold(dev);
+ return dev;
+}
+
+/**
+ * Find a net_device matching the given address, which is an upper device of
+ * the given net_device.
+ * @addr: IP address to look for.
+ * @dev: base IPoIB net_device
+ *
+ * If found, returns the net_device with a reference held. Otherwise return
+ * NULL.
+ */
+static struct net_device *ipoib_get_net_dev_match_addr(
+ const struct sockaddr *addr, struct net_device *dev)
+{
+ struct net_device *upper,
+ *result = NULL;
+ struct list_head *iter;
+
+ rcu_read_lock();
+ if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
+ dev_hold(dev);
+ result = dev;
+ goto out;
+ }
+
+ netdev_for_each_all_upper_dev_rcu(dev, upper, iter) {
+ if (ipoib_is_dev_match_addr_rcu(addr, upper)) {
+ dev_hold(upper);
+ result = upper;
+ break;
+ }
+ }
+out:
+ rcu_read_unlock();
+ return result;
+}
+
+/* returns the number of IPoIB netdevs on top a given ipoib device matching a
+ * pkey_index and address, if one exists.
+ *
+ * @found_net_dev: contains a matching net_device if the return value >= 1,
+ * with a reference held. */
+static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
+ const union ib_gid *gid,
+ u16 pkey_index,
+ const struct sockaddr *addr,
+ int nesting,
+ struct net_device **found_net_dev)
+{
+ struct ipoib_dev_priv *child_priv;
+ struct net_device *net_dev = NULL;
+ int matches = 0;
+
+ if (priv->pkey_index == pkey_index &&
+ (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
+ if (!addr) {
+ net_dev = ipoib_get_master_net_dev(priv->dev);
+ } else {
+ /* Verify the net_device matches the IP address, as
+ * IPoIB child devices currently share a GID. */
+ net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
+ }
+ if (net_dev) {
+ if (!*found_net_dev)
+ *found_net_dev = net_dev;
+ else
+ dev_put(net_dev);
+ ++matches;
+ }
+ }
+
+ /* Check child interfaces */
+ down_read_nested(&priv->vlan_rwsem, nesting);
+ list_for_each_entry(child_priv, &priv->child_intfs, list) {
+ matches += ipoib_match_gid_pkey_addr(child_priv, gid,
+ pkey_index, addr,
+ nesting + 1,
+ found_net_dev);
+ if (matches > 1)
+ break;
+ }
+ up_read(&priv->vlan_rwsem);
+
+ return matches;
+}
+
+/* Returns the number of matching net_devs found (between 0 and 2). Also
+ * return the matching net_device in the @net_dev parameter, holding a
+ * reference to the net_device, if the number of matches >= 1 */
+static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
+ u16 pkey_index,
+ const union ib_gid *gid,
+ const struct sockaddr *addr,
+ struct net_device **net_dev)
+{
+ struct ipoib_dev_priv *priv;
+ int matches = 0;
+
+ *net_dev = NULL;
+
+ list_for_each_entry(priv, dev_list, list) {
+ if (priv->port != port)
+ continue;
+
+ matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,
+ addr, 0, net_dev);
+ if (matches > 1)
+ break;
+ }
+
+ return matches;
+}
+
+static struct net_device *ipoib_get_net_dev_by_params(
+ struct ib_device *dev, u8 port, u16 pkey,
+ const union ib_gid *gid, const struct sockaddr *addr,
+ void *client_data)
+{
+ struct net_device *net_dev;
+ struct list_head *dev_list = client_data;
+ u16 pkey_index;
+ int matches;
+ int ret;
+
+ if (!rdma_protocol_ib(dev, port))
+ return NULL;
+
+ ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
+ if (ret)
+ return NULL;
+
+ if (!dev_list)
+ return NULL;
+
+ /* See if we can find a unique device matching the L2 parameters */
+ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
+ gid, NULL, &net_dev);
+
+ switch (matches) {
+ case 0:
+ return NULL;
+ case 1:
+ return net_dev;
+ }
+
+ dev_put(net_dev);
+
+ /* Couldn't find a unique device with L2 parameters only. Use L3
+ * address to uniquely match the net device */
+ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
+ gid, addr, &net_dev);
+ switch (matches) {
+ case 0:
+ return NULL;
+ default:
+ dev_warn_ratelimited(&dev->dev,
+ "duplicate IP address detected\n");
+ /* Fall through */
+ case 1:
+ return net_dev;
+ }
+}
+
int ipoib_set_mode(struct net_device *dev, const char *buf)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -232,8 +459,9 @@ int ipoib_set_mode(struct net_device *dev, const char *buf)
ipoib_warn(priv, "enabling connected mode "
"will cause multicast packet drops\n");
netdev_update_features(dev);
+ dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
rtnl_unlock();
- priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+ priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
ipoib_flush_paths(dev);
rtnl_lock();
@@ -921,6 +1149,9 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
unsigned long dt;
unsigned long flags;
int i;
+ LIST_HEAD(remove_list);
+ struct ipoib_mcast *mcast, *tmcast;
+ struct net_device *dev = priv->dev;
if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
return;
@@ -948,6 +1179,19 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
lockdep_is_held(&priv->lock))) != NULL) {
/* was the neigh idle for two GC periods */
if (time_after(neigh_obsolete, neigh->alive)) {
+ u8 *mgid = neigh->daddr + 4;
+
+ /* Is this multicast ? */
+ if (*mgid == 0xff) {
+ mcast = __ipoib_mcast_find(dev, mgid);
+
+ if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ list_del(&mcast->list);
+ rb_erase(&mcast->rb_node, &priv->multicast_tree);
+ list_add_tail(&mcast->list, &remove_list);
+ }
+ }
+
rcu_assign_pointer(*np,
rcu_dereference_protected(neigh->hnext,
lockdep_is_held(&priv->lock)));
@@ -963,6 +1207,10 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
out_unlock:
spin_unlock_irqrestore(&priv->lock, flags);
+ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+ ipoib_mcast_leave(dev, mcast);
+ ipoib_mcast_free(mcast);
+ }
}
static void ipoib_reap_neigh(struct work_struct *work)
@@ -1128,7 +1376,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
{
struct ipoib_neigh_table *ntbl = &priv->ntbl;
struct ipoib_neigh_hash *htbl;
- struct ipoib_neigh **buckets;
+ struct ipoib_neigh __rcu **buckets;
u32 size;
clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
@@ -1146,7 +1394,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
htbl->size = size;
htbl->mask = (size - 1);
htbl->buckets = buckets;
- ntbl->htbl = htbl;
+ RCU_INIT_POINTER(ntbl->htbl, htbl);
htbl->ntbl = ntbl;
atomic_set(&ntbl->entries, 0);
@@ -1577,7 +1825,8 @@ static struct net_device *ipoib_add_port(const char *format,
SET_NETDEV_DEV(priv->dev, hca->dma_device);
priv->dev->dev_id = port - 1;
- if (!ib_query_port(hca, port, &attr))
+ result = ib_query_port(hca, port, &attr);
+ if (!result)
priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
else {
printk(KERN_WARNING "%s: ib_query_port %d failed\n",
@@ -1598,7 +1847,8 @@ static struct net_device *ipoib_add_port(const char *format,
goto device_init_failed;
}
- if (ipoib_set_dev_features(priv, hca))
+ result = ipoib_set_dev_features(priv, hca);
+ if (result)
goto device_init_failed;
/*
@@ -1610,7 +1860,7 @@ static struct net_device *ipoib_add_port(const char *format,
priv->dev->broadcast[8] = priv->pkey >> 8;
priv->dev->broadcast[9] = priv->pkey & 0xff;
- result = ib_query_gid(hca, port, 0, &priv->local_gid);
+ result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
if (result) {
printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
hca->name, port, result);
@@ -1684,10 +1934,8 @@ static void ipoib_add_one(struct ib_device *device)
struct list_head *dev_list;
struct net_device *dev;
struct ipoib_dev_priv *priv;
- int s, e, p;
-
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
+ int p;
+ int count = 0;
dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
if (!dev_list)
@@ -1695,36 +1943,30 @@ static void ipoib_add_one(struct ib_device *device)
INIT_LIST_HEAD(dev_list);
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- s = 0;
- e = 0;
- } else {
- s = 1;
- e = device->phys_port_cnt;
- }
-
- for (p = s; p <= e; ++p) {
- if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
+ for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+ if (!rdma_protocol_ib(device, p))
continue;
dev = ipoib_add_port("ib%d", device, p);
if (!IS_ERR(dev)) {
priv = netdev_priv(dev);
list_add_tail(&priv->list, dev_list);
+ count++;
}
}
+ if (!count) {
+ kfree(dev_list);
+ return;
+ }
+
ib_set_client_data(device, &ipoib_client, dev_list);
}
-static void ipoib_remove_one(struct ib_device *device)
+static void ipoib_remove_one(struct ib_device *device, void *client_data)
{
struct ipoib_dev_priv *priv, *tmp;
- struct list_head *dev_list;
-
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
+ struct list_head *dev_list = client_data;
- dev_list = ib_get_client_data(device, &ipoib_client);
if (!dev_list)
return;
diff --git a/kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 140c94ce7..6dcef673d 100644
--- a/kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -106,7 +106,7 @@ static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv,
queue_delayed_work(priv->wq, &priv->mcast_task, 0);
}
-static void ipoib_mcast_free(struct ipoib_mcast *mcast)
+void ipoib_mcast_free(struct ipoib_mcast *mcast)
{
struct net_device *dev = mcast->dev;
int tx_dropped = 0;
@@ -153,7 +153,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
return mcast;
}
-static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
+struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct rb_node *n = priv->multicast_tree.rb_node;
@@ -245,7 +245,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
spin_unlock_irq(&priv->lock);
- priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+ priv->tx_wr.remote_qkey = priv->qkey;
set_qkey = 1;
}
@@ -393,8 +393,13 @@ static int ipoib_mcast_join_complete(int status,
goto out_locked;
}
} else {
- if (mcast->logcount++ < 20) {
- if (status == -ETIMEDOUT || status == -EAGAIN) {
+ bool silent_fail =
+ test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+ status == -EINVAL;
+
+ if (mcast->logcount < 20) {
+ if (status == -ETIMEDOUT || status == -EAGAIN ||
+ silent_fail) {
ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n",
test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
mcast->mcmember.mgid.raw, status);
@@ -403,6 +408,9 @@ static int ipoib_mcast_join_complete(int status,
test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
mcast->mcmember.mgid.raw, status);
}
+
+ if (!silent_fail)
+ mcast->logcount++;
}
if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
@@ -448,8 +456,7 @@ out_locked:
return status;
}
-static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
- int create)
+static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ib_sa_multicast *multicast;
@@ -471,7 +478,14 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
IB_SA_MCMEMBER_REC_PKEY |
IB_SA_MCMEMBER_REC_JOIN_STATE;
- if (create) {
+ if (mcast != priv->broadcast) {
+ /*
+ * RFC 4391:
+ * The MGID MUST use the same P_Key, Q_Key, SL, MTU,
+ * and HopLimit as those used in the broadcast-GID. The rest
+ * of attributes SHOULD follow the values used in the
+ * broadcast-GID as well.
+ */
comp_mask |=
IB_SA_MCMEMBER_REC_QKEY |
IB_SA_MCMEMBER_REC_MTU_SELECTOR |
@@ -492,6 +506,24 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
rec.sl = priv->broadcast->mcmember.sl;
rec.flow_label = priv->broadcast->mcmember.flow_label;
rec.hop_limit = priv->broadcast->mcmember.hop_limit;
+
+ /*
+ * Send-only IB Multicast joins do not work at the core
+ * IB layer yet, so we can't use them here. However,
+ * we are emulating an Ethernet multicast send, which
+ * does not require a multicast subscription and will
+ * still send properly. The most appropriate thing to
+ * do is to create the group if it doesn't exist as that
+ * most closely emulates the behavior, from a user space
+ * application perspecitive, of Ethernet multicast
+ * operation. For now, we do a full join, maybe later
+ * when the core IB layers support send only joins we
+ * will use them.
+ */
+#if 0
+ if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+ rec.join_state = 4;
+#endif
}
multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
@@ -517,7 +549,6 @@ void ipoib_mcast_join_task(struct work_struct *work)
struct ib_port_attr port_attr;
unsigned long delay_until = 0;
struct ipoib_mcast *mcast = NULL;
- int create = 1;
if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
return;
@@ -530,7 +561,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
}
priv->local_lid = port_attr.lid;
- if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+ if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL))
ipoib_warn(priv, "ib_query_gid() failed\n");
else
memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
@@ -566,7 +597,6 @@ void ipoib_mcast_join_task(struct work_struct *work)
if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) {
mcast = priv->broadcast;
- create = 0;
if (mcast->backoff > 1 &&
time_before(jiffies, mcast->delay_until)) {
delay_until = mcast->delay_until;
@@ -590,12 +620,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
/* Found the next unjoined group */
init_completion(&mcast->done);
set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
- if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
- create = 0;
- else
- create = 1;
spin_unlock_irq(&priv->lock);
- ipoib_mcast_join(dev, mcast, create);
+ ipoib_mcast_join(dev, mcast);
spin_lock_irq(&priv->lock);
} else if (!delay_until ||
time_before(mcast->delay_until, delay_until))
@@ -618,7 +644,7 @@ out:
}
spin_unlock_irq(&priv->lock);
if (mcast)
- ipoib_mcast_join(dev, mcast, create);
+ ipoib_mcast_join(dev, mcast);
}
int ipoib_mcast_start_thread(struct net_device *dev)
@@ -651,7 +677,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev)
return 0;
}
-static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
+int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret = 0;
diff --git a/kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 2d13fd08c..d48c5bae7 100644
--- a/kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -141,6 +141,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
.sq_sig_type = IB_SIGNAL_ALL_WR,
.qp_type = IB_QPT_UD
};
+ struct ib_cq_init_attr cq_attr = {};
int ret, size;
int i;
@@ -151,12 +152,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
return -ENODEV;
}
- priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(priv->mr)) {
- printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
- goto out_free_pd;
- }
-
/*
* the various IPoIB tasks assume they will never race against
* themselves, so always use a single thread workqueue
@@ -164,7 +159,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
priv->wq = create_singlethread_workqueue("ipoib_wq");
if (!priv->wq) {
printk(KERN_WARNING "ipoib: failed to allocate device WQ\n");
- goto out_free_mr;
+ goto out_free_pd;
}
size = ipoib_recvq_size + 1;
@@ -179,14 +174,17 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
if (ret != -ENOSYS)
goto out_free_wq;
- priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
+ cq_attr.cqe = size;
+ priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL,
+ dev, &cq_attr);
if (IS_ERR(priv->recv_cq)) {
printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
goto out_cm_dev_cleanup;
}
+ cq_attr.cqe = ipoib_sendq_size;
priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL,
- dev, ipoib_sendq_size, 0);
+ dev, &cq_attr);
if (IS_ERR(priv->send_cq)) {
printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
goto out_free_recv_cq;
@@ -221,13 +219,13 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff;
for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
- priv->tx_sge[i].lkey = priv->mr->lkey;
+ priv->tx_sge[i].lkey = priv->pd->local_dma_lkey;
- priv->tx_wr.opcode = IB_WR_SEND;
- priv->tx_wr.sg_list = priv->tx_sge;
- priv->tx_wr.send_flags = IB_SEND_SIGNALED;
+ priv->tx_wr.wr.opcode = IB_WR_SEND;
+ priv->tx_wr.wr.sg_list = priv->tx_sge;
+ priv->tx_wr.wr.send_flags = IB_SEND_SIGNALED;
- priv->rx_sge[0].lkey = priv->mr->lkey;
+ priv->rx_sge[0].lkey = priv->pd->local_dma_lkey;
priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
priv->rx_wr.num_sge = 1;
@@ -250,9 +248,6 @@ out_free_wq:
destroy_workqueue(priv->wq);
priv->wq = NULL;
-out_free_mr:
- ib_dereg_mr(priv->mr);
-
out_free_pd:
ib_dealloc_pd(priv->pd);
@@ -285,12 +280,7 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
priv->wq = NULL;
}
- if (ib_dereg_mr(priv->mr))
- ipoib_warn(priv, "ib_dereg_mr failed\n");
-
- if (ib_dealloc_pd(priv->pd))
- ipoib_warn(priv, "ib_dealloc_pd failed\n");
-
+ ib_dealloc_pd(priv->pd);
}
void ipoib_event(struct ib_event_handler *handler,