diff options
Diffstat (limited to 'kernel/drivers/net/vxlan.c')
-rw-r--r-- | kernel/drivers/net/vxlan.c | 984 |
1 files changed, 625 insertions, 359 deletions
diff --git a/kernel/drivers/net/vxlan.c b/kernel/drivers/net/vxlan.c index 0085b8df8..e0fcda4dd 100644 --- a/kernel/drivers/net/vxlan.c +++ b/kernel/drivers/net/vxlan.c @@ -49,15 +49,12 @@ #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> #endif +#include <net/dst_metadata.h> #define VXLAN_VERSION "0.1" #define PORT_HASH_BITS 8 #define PORT_HASH_SIZE (1<<PORT_HASH_BITS) -#define VNI_HASH_BITS 10 -#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) -#define FDB_HASH_BITS 8 -#define FDB_HASH_SIZE (1<<FDB_HASH_BITS) #define FDB_AGE_DEFAULT 300 /* 5 min */ #define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ @@ -74,9 +71,12 @@ module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static int vxlan_net_id; +static struct rtnl_link_ops vxlan_link_ops; static const u8 all_zeros_mac[ETH_ALEN]; +static int vxlan_sock_add(struct vxlan_dev *vxlan); + /* per-network namespace private data for this module */ struct vxlan_net { struct list_head vxlan_list; @@ -84,21 +84,6 @@ struct vxlan_net { spinlock_t sock_lock; }; -union vxlan_addr { - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - struct sockaddr sa; -}; - -struct vxlan_rdst { - union vxlan_addr remote_ip; - __be16 remote_port; - u32 remote_vni; - u32 remote_ifindex; - struct list_head list; - struct rcu_head rcu; -}; - /* Forwarding table entry */ struct vxlan_fdb { struct hlist_node hlist; /* linked list of entries */ @@ -106,40 +91,21 @@ struct vxlan_fdb { unsigned long updated; /* jiffies */ unsigned long used; struct list_head remotes; + u8 eth_addr[ETH_ALEN]; u16 state; /* see ndm_state */ u8 flags; /* see ndm_flags */ - u8 eth_addr[ETH_ALEN]; -}; - -/* Pseudo network device */ -struct vxlan_dev { - struct hlist_node hlist; /* vni hash table */ - struct list_head next; /* vxlan's per namespace list */ - struct vxlan_sock *vn_sock; /* listening socket */ - struct net_device *dev; - struct net *net; /* netns for packet i/o */ - struct vxlan_rdst default_dst; /* default destination */ - union vxlan_addr saddr; /* source address */ - __be16 dst_port; - __u16 port_min; /* source port range */ - __u16 port_max; - __u8 tos; /* TOS override */ - __u8 ttl; - u32 flags; /* VXLAN_F_* in vxlan.h */ - - unsigned long age_interval; - struct timer_list age_timer; - spinlock_t hash_lock; - unsigned int addrcnt; - unsigned int addrmax; - - struct hlist_head fdb_head[FDB_HASH_SIZE]; }; /* salt for hash table */ static u32 vxlan_salt __read_mostly; static struct workqueue_struct *vxlan_wq; +static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) +{ + return vs->flags & VXLAN_F_COLLECT_METADATA || + ip_tunnel_collect_metadata(); +} + #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) @@ -269,7 +235,7 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { if (inet_sk(vs->sock->sk)->inet_sport == port && - inet_sk(vs->sock->sk)->sk.sk_family == family && + vxlan_get_sk_family(vs) == family && vs->flags == flags) return vs; } @@ -336,7 +302,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (!net_eq(dev_net(vxlan->dev), vxlan->net) && nla_put_s32(skb, NDA_LINK_NETNSID, - peernet2id(dev_net(vxlan->dev), vxlan->net))) + peernet2id_alloc(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) @@ -345,7 +311,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) goto nla_put_failure; - if (rdst->remote_port && rdst->remote_port != vxlan->dst_port && + if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && nla_put_be16(skb, NDA_PORT, rdst->remote_port)) goto nla_put_failure; if (rdst->remote_vni != vxlan->default_dst.remote_vni && @@ -552,10 +518,10 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, u32 data, struct gro_remcsum *grc, bool nopartial) { - size_t start, offset, plen; + size_t start, offset; if (skb->remcsum_offload) - return NULL; + return vh; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; @@ -565,17 +531,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, offsetof(struct udphdr, check) : offsetof(struct tcphdr, check)); - plen = hdrlen + offset + sizeof(u16); - - /* Pull checksum that will be written */ - if (skb_gro_header_hard(skb, off + plen)) { - vh = skb_gro_header_slow(skb, off + plen, off); - if (!vh) - return NULL; - } - - skb_gro_remcsum_process(skb, (void *)vh + hdrlen, - start, offset, grc, nopartial); + vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, + start, offset, grc, nopartial); skb->remcsum_offload = 1; @@ -606,7 +563,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, goto out; } - skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); flags = ntohl(vh->vx_flags); @@ -621,6 +577,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, goto out; } + skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ + flush = 0; for (p = *head; p; p = p->next) { @@ -658,7 +616,7 @@ static void vxlan_notify_add_rx_port(struct vxlan_sock *vs) struct net_device *dev; struct sock *sk = vs->sock->sk; struct net *net = sock_net(sk); - sa_family_t sa_family = sk->sk_family; + sa_family_t sa_family = vxlan_get_sk_family(vs); __be16 port = inet_sk(sk)->inet_sport; int err; @@ -683,7 +641,7 @@ static void vxlan_notify_del_rx_port(struct vxlan_sock *vs) struct net_device *dev; struct sock *sk = vs->sock->sk; struct net *net = sock_net(sk); - sa_family_t sa_family = sk->sk_family; + sa_family_t sa_family = vxlan_get_sk_family(vs); __be16 port = inet_sk(sk)->inet_sport; rcu_read_lock(); @@ -749,7 +707,8 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, if (!(flags & NLM_F_CREATE)) return -ENOENT; - if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax) + if (vxlan->cfg.addrmax && + vxlan->addrcnt >= vxlan->cfg.addrmax) return -ENOSPC; /* Disallow replace to add a multicast entry */ @@ -835,7 +794,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, return -EINVAL; *port = nla_get_be16(tb[NDA_PORT]); } else { - *port = vxlan->dst_port; + *port = vxlan->cfg.dst_port; } if (tb[NDA_VNI]) { @@ -963,10 +922,10 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; - if (idx < cb->args[0]) - goto skip; - list_for_each_entry_rcu(rd, &f->remotes, list) { + if (idx < cb->args[0]) + goto skip; + err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -974,9 +933,9 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, NLM_F_MULTI, rd); if (err < 0) goto out; - } skip: - ++idx; + ++idx; + } } } out: @@ -1021,7 +980,7 @@ static bool vxlan_snoop(struct net_device *dev, vxlan_fdb_create(vxlan, src_mac, src_ip, NUD_REACHABLE, NLM_F_EXCL|NLM_F_CREATE, - vxlan->dst_port, + vxlan->cfg.dst_port, vxlan->default_dst.remote_vni, 0, NTF_SELF); spin_unlock(&vxlan->hash_lock); @@ -1034,19 +993,30 @@ static bool vxlan_snoop(struct net_device *dev, static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) { struct vxlan_dev *vxlan; + unsigned short family = dev->default_dst.remote_ip.sa.sa_family; /* The vxlan_sock is only used by dev, leaving group has * no effect on other vxlan devices. */ - if (atomic_read(&dev->vn_sock->refcnt) == 1) + if (family == AF_INET && dev->vn4_sock && + atomic_read(&dev->vn4_sock->refcnt) == 1) + return false; +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6 && dev->vn6_sock && + atomic_read(&dev->vn6_sock->refcnt) == 1) return false; +#endif list_for_each_entry(vxlan, &vn->vxlan_list, next) { if (!netif_running(vxlan->dev) || vxlan == dev) continue; - if (vxlan->vn_sock != dev->vn_sock) + if (family == AF_INET && vxlan->vn4_sock != dev->vn4_sock) continue; +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6 && vxlan->vn6_sock != dev->vn6_sock) + continue; +#endif if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip, &dev->default_dst.remote_ip)) @@ -1062,15 +1032,16 @@ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) return false; } -void vxlan_sock_release(struct vxlan_sock *vs) +static void __vxlan_sock_release(struct vxlan_sock *vs) { - struct sock *sk = vs->sock->sk; - struct net *net = sock_net(sk); - struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_net *vn; + if (!vs) + return; if (!atomic_dec_and_test(&vs->refcnt)) return; + vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); vxlan_notify_del_rx_port(vs); @@ -1078,34 +1049,44 @@ void vxlan_sock_release(struct vxlan_sock *vs) queue_work(vxlan_wq, &vs->del_work); } -EXPORT_SYMBOL_GPL(vxlan_sock_release); + +static void vxlan_sock_release(struct vxlan_dev *vxlan) +{ + __vxlan_sock_release(vxlan->vn4_sock); +#if IS_ENABLED(CONFIG_IPV6) + __vxlan_sock_release(vxlan->vn6_sock); +#endif +} /* Update multicast group membership when first VNI on * multicast address is brought up */ static int vxlan_igmp_join(struct vxlan_dev *vxlan) { - struct vxlan_sock *vs = vxlan->vn_sock; - struct sock *sk = vs->sock->sk; + struct sock *sk; union vxlan_addr *ip = &vxlan->default_dst.remote_ip; int ifindex = vxlan->default_dst.remote_ifindex; int ret = -EINVAL; - lock_sock(sk); if (ip->sa.sa_family == AF_INET) { struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, .imr_ifindex = ifindex, }; + sk = vxlan->vn4_sock->sock->sk; + lock_sock(sk); ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); #if IS_ENABLED(CONFIG_IPV6) } else { + sk = vxlan->vn6_sock->sock->sk; + lock_sock(sk); ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex, &ip->sin6.sin6_addr); + release_sock(sk); #endif } - release_sock(sk); return ret; } @@ -1113,27 +1094,30 @@ static int vxlan_igmp_join(struct vxlan_dev *vxlan) /* Inverse of vxlan_igmp_join when last VNI is brought down */ static int vxlan_igmp_leave(struct vxlan_dev *vxlan) { - struct vxlan_sock *vs = vxlan->vn_sock; - struct sock *sk = vs->sock->sk; + struct sock *sk; union vxlan_addr *ip = &vxlan->default_dst.remote_ip; int ifindex = vxlan->default_dst.remote_ifindex; int ret = -EINVAL; - lock_sock(sk); if (ip->sa.sa_family == AF_INET) { struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, .imr_ifindex = ifindex, }; + sk = vxlan->vn4_sock->sock->sk; + lock_sock(sk); ret = ip_mc_leave_group(sk, &mreq); + release_sock(sk); #if IS_ENABLED(CONFIG_IPV6) } else { + sk = vxlan->vn6_sock->sock->sk; + lock_sock(sk); ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, &ip->sin6.sin6_addr); + release_sock(sk); #endif } - release_sock(sk); return ret; } @@ -1143,6 +1127,9 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, { size_t start, offset, plen; + if (skb->remcsum_offload) + return vh; + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; offset = start + ((data & VXLAN_RCO_UDP) ? offsetof(struct udphdr, check) : @@ -1161,13 +1148,109 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, return vh; } +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, + struct vxlan_metadata *md, u32 vni, + struct metadata_dst *tun_dst) +{ + struct iphdr *oip = NULL; + struct ipv6hdr *oip6 = NULL; + struct vxlan_dev *vxlan; + struct pcpu_sw_netstats *stats; + union vxlan_addr saddr; + int err = 0; + + /* For flow based devices, map all packets to VNI 0 */ + if (vs->flags & VXLAN_F_COLLECT_METADATA) + vni = 0; + + /* Is this VNI defined? */ + vxlan = vxlan_vs_find_vni(vs, vni); + if (!vxlan) + goto drop; + + skb_reset_mac_header(skb); + skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); + skb->protocol = eth_type_trans(skb, vxlan->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + /* Ignore packet loops (and multicast echo) */ + if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) + goto drop; + + /* Get data from the outer IP header */ + if (vxlan_get_sk_family(vs) == AF_INET) { + oip = ip_hdr(skb); + saddr.sin.sin_addr.s_addr = oip->saddr; + saddr.sa.sa_family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + oip6 = ipv6_hdr(skb); + saddr.sin6.sin6_addr = oip6->saddr; + saddr.sa.sa_family = AF_INET6; +#endif + } + + if (tun_dst) { + skb_dst_set(skb, (struct dst_entry *)tun_dst); + tun_dst = NULL; + } + + if ((vxlan->flags & VXLAN_F_LEARN) && + vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) + goto drop; + + skb_reset_network_header(skb); + /* In flow-based mode, GBP is carried in dst_metadata */ + if (!(vs->flags & VXLAN_F_COLLECT_METADATA)) + skb->mark = md->gbp; + + if (oip6) + err = IP6_ECN_decapsulate(oip6, skb); + if (oip) + err = IP_ECN_decapsulate(oip, skb); + + if (unlikely(err)) { + if (log_ecn_error) { + if (oip6) + net_info_ratelimited("non-ECT from %pI6\n", + &oip6->saddr); + if (oip) + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", + &oip->saddr, oip->tos); + } + if (err > 1) { + ++vxlan->dev->stats.rx_frame_errors; + ++vxlan->dev->stats.rx_errors; + goto drop; + } + } + + stats = this_cpu_ptr(vxlan->dev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); + + gro_cells_receive(&vxlan->gro_cells, skb); + + return; +drop: + if (tun_dst) + dst_release((struct dst_entry *)tun_dst); + + /* Consume bad packet */ + kfree_skb(skb); +} + /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { + struct metadata_dst *tun_dst = NULL; struct vxlan_sock *vs; struct vxlanhdr *vxh; u32 flags, vni; - struct vxlan_metadata md = {0}; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; /* Need Vxlan and inner Ethernet header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) @@ -1202,6 +1285,18 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) vni &= VXLAN_VNI_MASK; } + if (vxlan_collect_metadata(vs)) { + tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY, + cpu_to_be64(vni >> 8), sizeof(*md)); + + if (!tun_dst) + goto drop; + + md = ip_tunnel_info_opts(&tun_dst->u.tun_info); + } else { + memset(md, 0, sizeof(*md)); + } + /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. */ @@ -1209,13 +1304,16 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct vxlanhdr_gbp *gbp; gbp = (struct vxlanhdr_gbp *)vxh; - md.gbp = ntohs(gbp->policy_id); + md->gbp = ntohs(gbp->policy_id); + + if (tun_dst) + tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT; if (gbp->dont_learn) - md.gbp |= VXLAN_GBP_DONT_LEARN; + md->gbp |= VXLAN_GBP_DONT_LEARN; if (gbp->policy_applied) - md.gbp |= VXLAN_GBP_POLICY_APPLIED; + md->gbp |= VXLAN_GBP_POLICY_APPLIED; flags &= ~VXLAN_GBP_USED_BITS; } @@ -1233,8 +1331,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto bad_flags; } - md.vni = vxh->vx_vni; - vs->rcv(vs, skb, &md); + vxlan_rcv(vs, skb, md, vni >> 8, tun_dst); return 0; drop: @@ -1247,93 +1344,13 @@ bad_flags: ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); error: + if (tun_dst) + dst_release((struct dst_entry *)tun_dst); + /* Return non vxlan pkt */ return 1; } -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, - struct vxlan_metadata *md) -{ - struct iphdr *oip = NULL; - struct ipv6hdr *oip6 = NULL; - struct vxlan_dev *vxlan; - struct pcpu_sw_netstats *stats; - union vxlan_addr saddr; - __u32 vni; - int err = 0; - union vxlan_addr *remote_ip; - - vni = ntohl(md->vni) >> 8; - /* Is this VNI defined? */ - vxlan = vxlan_vs_find_vni(vs, vni); - if (!vxlan) - goto drop; - - remote_ip = &vxlan->default_dst.remote_ip; - skb_reset_mac_header(skb); - skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); - skb->protocol = eth_type_trans(skb, vxlan->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - - /* Ignore packet loops (and multicast echo) */ - if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) - goto drop; - - /* Re-examine inner Ethernet packet */ - if (remote_ip->sa.sa_family == AF_INET) { - oip = ip_hdr(skb); - saddr.sin.sin_addr.s_addr = oip->saddr; - saddr.sa.sa_family = AF_INET; -#if IS_ENABLED(CONFIG_IPV6) - } else { - oip6 = ipv6_hdr(skb); - saddr.sin6.sin6_addr = oip6->saddr; - saddr.sa.sa_family = AF_INET6; -#endif - } - - if ((vxlan->flags & VXLAN_F_LEARN) && - vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) - goto drop; - - skb_reset_network_header(skb); - skb->mark = md->gbp; - - if (oip6) - err = IP6_ECN_decapsulate(oip6, skb); - if (oip) - err = IP_ECN_decapsulate(oip, skb); - - if (unlikely(err)) { - if (log_ecn_error) { - if (oip6) - net_info_ratelimited("non-ECT from %pI6\n", - &oip6->saddr); - if (oip) - net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", - &oip->saddr, oip->tos); - } - if (err > 1) { - ++vxlan->dev->stats.rx_frame_errors; - ++vxlan->dev->stats.rx_errors; - goto drop; - } - } - - stats = this_cpu_ptr(vxlan->dev->tstats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(&stats->syncp); - - netif_rx(skb); - - return; -drop: - /* Consume bad packet */ - kfree_skb(skb); -} - static int arp_reduce(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -1672,7 +1689,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, __u8 prio, __u8 ttl, - __be16 src_port, __be16 dst_port, + __be16 src_port, __be16 dst_port, __be32 vni, struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; @@ -1722,7 +1739,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk, vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_HF_VNI); - vxh->vx_vni = md->vni; + vxh->vx_vni = vni; if (type & SKB_GSO_TUNNEL_REMCSUM) { u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> @@ -1755,10 +1772,10 @@ err: } #endif -int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, - __be16 src_port, __be16 dst_port, - struct vxlan_metadata *md, bool xnet, u32 vxflags) +static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, __be32 vni, + struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; int min_headroom; @@ -1801,7 +1818,7 @@ int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_HF_VNI); - vxh->vx_vni = md->vni; + vxh->vx_vni = vni; if (type & SKB_GSO_TUNNEL_REMCSUM) { u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> @@ -1828,7 +1845,34 @@ int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, ttl, df, src_port, dst_port, xnet, !(vxflags & VXLAN_F_UDP_CSUM)); } -EXPORT_SYMBOL_GPL(vxlan_xmit_skb); + +#if IS_ENABLED(CONFIG_IPV6) +static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, + struct sk_buff *skb, int oif, + const struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct dst_entry *ndst; + struct flowi6 fl6; + int err; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = oif; + fl6.daddr = *daddr; + fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr; + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = IPPROTO_UDP; + + err = ipv6_stub->ipv6_dst_lookup(vxlan->net, + vxlan->vn6_sock->sock->sk, + &ndst, &fl6); + if (err < 0) + return ERR_PTR(err); + + *saddr = fl6.saddr; + return ndst; +} +#endif /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, @@ -1878,22 +1922,44 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { + struct ip_tunnel_info *info; struct vxlan_dev *vxlan = netdev_priv(dev); - struct sock *sk = vxlan->vn_sock->sock->sk; + struct sock *sk; struct rtable *rt = NULL; const struct iphdr *old_iph; struct flowi4 fl4; union vxlan_addr *dst; - struct vxlan_metadata md; + union vxlan_addr remote_ip; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; __be16 src_port = 0, dst_port; u32 vni; __be16 df = 0; __u8 tos, ttl; int err; + u32 flags = vxlan->flags; + + info = skb_tunnel_info(skb); - dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; - vni = rdst->remote_vni; - dst = &rdst->remote_ip; + if (rdst) { + dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; + vni = rdst->remote_vni; + dst = &rdst->remote_ip; + } else { + if (!info) { + WARN_ONCE(1, "%s: Missing encapsulation instructions\n", + dev->name); + goto drop; + } + dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; + vni = be64_to_cpu(info->key.tun_id); + remote_ip.sa.sa_family = ip_tunnel_info_af(info); + if (remote_ip.sa.sa_family == AF_INET) + remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; + else + remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; + dst = &remote_ip; + } if (vxlan_addr_any(dst)) { if (did_rsc) { @@ -1906,23 +1972,49 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, old_iph = ip_hdr(skb); - ttl = vxlan->ttl; + ttl = vxlan->cfg.ttl; if (!ttl && vxlan_addr_multicast(dst)) ttl = 1; - tos = vxlan->tos; + tos = vxlan->cfg.tos; if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); - src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->port_min, - vxlan->port_max, true); + src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, + vxlan->cfg.port_max, true); + + if (info) { + ttl = info->key.ttl; + tos = info->key.tos; + + if (info->options_len) + md = ip_tunnel_info_opts(info); + } else { + md->gbp = skb->mark; + } if (dst->sa.sa_family == AF_INET) { + if (!vxlan->vn4_sock) + goto drop; + sk = vxlan->vn4_sock->sock->sk; + + if (info) { + if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + + if (info->key.tun_flags & TUNNEL_CSUM) + flags |= VXLAN_F_UDP_CSUM; + else + flags &= ~VXLAN_F_UDP_CSUM; + } + memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = rdst->remote_ifindex; + fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0; fl4.flowi4_tos = RT_TOS(tos); + fl4.flowi4_mark = skb->mark; + fl4.flowi4_proto = IPPROTO_UDP; fl4.daddr = dst->sin.sin_addr.s_addr; - fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr; + fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr; rt = ip_route_output_key(vxlan->net, &fl4); if (IS_ERR(rt)) { @@ -1956,14 +2048,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - md.vni = htonl(vni << 8); - md.gbp = skb->mark; - err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr, dst->sin.sin_addr.s_addr, tos, ttl, df, - src_port, dst_port, &md, + src_port, dst_port, htonl(vni << 8), md, !net_eq(vxlan->net, dev_net(vxlan->dev)), - vxlan->flags); + flags); if (err < 0) { /* skb is already freed. */ skb = NULL; @@ -1974,16 +2063,17 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, #if IS_ENABLED(CONFIG_IPV6) } else { struct dst_entry *ndst; - struct flowi6 fl6; - u32 flags; + struct in6_addr saddr; + u32 rt6i_flags; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = rdst->remote_ifindex; - fl6.daddr = dst->sin6.sin6_addr; - fl6.saddr = vxlan->saddr.sin6.sin6_addr; - fl6.flowi6_proto = IPPROTO_UDP; + if (!vxlan->vn6_sock) + goto drop; + sk = vxlan->vn6_sock->sock->sk; - if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) { + ndst = vxlan6_get_route(vxlan, skb, + rdst ? rdst->remote_ifindex : 0, + &dst->sin6.sin6_addr, &saddr); + if (IS_ERR(ndst)) { netdev_dbg(dev, "no route to %pI6\n", &dst->sin6.sin6_addr); dev->stats.tx_carrier_errors++; @@ -1999,9 +2089,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } /* Bypass encapsulation if the destination is local */ - flags = ((struct rt6_info *)ndst)->rt6i_flags; - if (flags & RTF_LOCAL && - !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; + if (rt6i_flags & RTF_LOCAL && + !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { struct vxlan_dev *dst_vxlan; dst_release(ndst); @@ -2014,14 +2104,18 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, return; } - ttl = ttl ? : ip6_dst_hoplimit(ndst); - md.vni = htonl(vni << 8); - md.gbp = skb->mark; + if (info) { + if (info->key.tun_flags & TUNNEL_CSUM) + flags &= ~VXLAN_F_UDP_ZERO_CSUM6_TX; + else + flags |= VXLAN_F_UDP_ZERO_CSUM6_TX; + } - err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, - 0, ttl, src_port, dst_port, &md, + ttl = ttl ? : ip6_dst_hoplimit(ndst); + err = vxlan6_xmit_skb(ndst, sk, skb, dev, &saddr, &dst->sin6.sin6_addr, + 0, ttl, src_port, dst_port, htonl(vni << 8), md, !net_eq(vxlan->net, dev_net(vxlan->dev)), - vxlan->flags); + flags); #endif } @@ -2048,11 +2142,14 @@ tx_free: static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); + const struct ip_tunnel_info *info; struct ethhdr *eth; bool did_rsc = false; struct vxlan_rdst *rdst, *fdst = NULL; struct vxlan_fdb *f; + info = skb_tunnel_info(skb); + skb_reset_mac_header(skb); eth = eth_hdr(skb); @@ -2075,6 +2172,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) #endif } + if (vxlan->flags & VXLAN_F_COLLECT_METADATA && + info && info->mode & IP_TUNNEL_INFO_TX) { + vxlan_xmit_one(skb, dev, NULL, false); + return NETDEV_TX_OK; + } + f = vxlan_find_mac(vxlan, eth->h_dest); did_rsc = false; @@ -2128,9 +2231,10 @@ static void vxlan_cleanup(unsigned long arg) if (!netif_running(vxlan->dev)) return; - spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; + + spin_lock_bh(&vxlan->hash_lock); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); @@ -2139,7 +2243,7 @@ static void vxlan_cleanup(unsigned long arg) if (f->state & NUD_PERMANENT) continue; - timeout = f->used + vxlan->age_interval * HZ; + timeout = f->used + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", @@ -2149,8 +2253,8 @@ static void vxlan_cleanup(unsigned long arg) } else if (time_before(timeout, next_timer)) next_timer = timeout; } + spin_unlock_bh(&vxlan->hash_lock); } - spin_unlock_bh(&vxlan->hash_lock); mod_timer(&vxlan->age_timer, next_timer); } @@ -2160,7 +2264,6 @@ static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); __u32 vni = vxlan->default_dst.remote_vni; - vxlan->vn_sock = vs; spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); spin_unlock(&vn->sock_lock); @@ -2200,27 +2303,23 @@ static void vxlan_uninit(struct net_device *dev) static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_sock *vs; - int ret = 0; + int ret; - vs = vxlan_sock_add(vxlan->net, vxlan->dst_port, vxlan_rcv, NULL, - false, vxlan->flags); - if (IS_ERR(vs)) - return PTR_ERR(vs); - - vxlan_vs_add_dev(vs, vxlan); + ret = vxlan_sock_add(vxlan); + if (ret < 0) + return ret; if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { ret = vxlan_igmp_join(vxlan); if (ret == -EADDRINUSE) ret = 0; if (ret) { - vxlan_sock_release(vs); + vxlan_sock_release(vxlan); return ret; } } - if (vxlan->age_interval) + if (vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); return ret; @@ -2250,7 +2349,6 @@ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); - struct vxlan_sock *vs = vxlan->vn_sock; int ret = 0; if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && @@ -2260,7 +2358,7 @@ static int vxlan_stop(struct net_device *dev) del_timer_sync(&vxlan->age_timer); vxlan_flush(vxlan); - vxlan_sock_release(vs); + vxlan_sock_release(vxlan); return ret; } @@ -2293,6 +2391,67 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu) return 0; } +static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb, + struct ip_tunnel_info *info, + __be16 sport, __be16 dport) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct rtable *rt; + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_tos = RT_TOS(info->key.tos); + fl4.flowi4_mark = skb->mark; + fl4.flowi4_proto = IPPROTO_UDP; + fl4.daddr = info->key.u.ipv4.dst; + + rt = ip_route_output_key(vxlan->net, &fl4); + if (IS_ERR(rt)) + return PTR_ERR(rt); + ip_rt_put(rt); + + info->key.u.ipv4.src = fl4.saddr; + info->key.tp_src = sport; + info->key.tp_dst = dport; + return 0; +} + +static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct ip_tunnel_info *info = skb_tunnel_info(skb); + __be16 sport, dport; + + sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, + vxlan->cfg.port_max, true); + dport = info->key.tp_dst ? : vxlan->cfg.dst_port; + + if (ip_tunnel_info_af(info) == AF_INET) { + if (!vxlan->vn4_sock) + return -EINVAL; + return egress_ipv4_tun_info(dev, skb, info, sport, dport); + } else { +#if IS_ENABLED(CONFIG_IPV6) + struct dst_entry *ndst; + + if (!vxlan->vn6_sock) + return -EINVAL; + ndst = vxlan6_get_route(vxlan, skb, 0, + &info->key.u.ipv6.dst, + &info->key.u.ipv6.src); + if (IS_ERR(ndst)) + return PTR_ERR(ndst); + dst_release(ndst); + + info->key.tp_src = sport; + info->key.tp_dst = dport; +#else /* !CONFIG_IPV6 */ + return -EPFNOSUPPORT; +#endif + } + return 0; +} + static const struct net_device_ops vxlan_netdev_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, @@ -2307,6 +2466,7 @@ static const struct net_device_ops vxlan_netdev_ops = { .ndo_fdb_add = vxlan_fdb_add, .ndo_fdb_del = vxlan_fdb_delete, .ndo_fdb_dump = vxlan_fdb_dump, + .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; /* Info for udev, that this is a virtual tunnel endpoint */ @@ -2331,7 +2491,7 @@ void vxlan_get_rx_port(struct net_device *dev) for (i = 0; i < PORT_HASH_SIZE; ++i) { hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { port = inet_sk(vs->sock->sk)->inet_sport; - sa_family = vs->sock->sk->sk_family; + sa_family = vxlan_get_sk_family(vs); dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, port); } @@ -2348,16 +2508,11 @@ static void vxlan_setup(struct net_device *dev) eth_hw_addr_random(dev); ether_setup(dev); - if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6) - dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM; - else - dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM; dev->netdev_ops = &vxlan_netdev_ops; dev->destructor = free_netdev; SET_NETDEV_DEVTYPE(dev, &vxlan_type); - dev->tx_queue_len = 0; dev->features |= NETIF_F_LLTX; dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; dev->features |= NETIF_F_RXCSUM; @@ -2369,7 +2524,7 @@ static void vxlan_setup(struct net_device *dev) dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; netif_keep_dst(dev); - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); @@ -2378,10 +2533,12 @@ static void vxlan_setup(struct net_device *dev) vxlan->age_timer.function = vxlan_cleanup; vxlan->age_timer.data = (unsigned long) vxlan; - vxlan->dst_port = htons(vxlan_port); + vxlan->cfg.dst_port = htons(vxlan_port); vxlan->dev = dev; + gro_cells_init(&vxlan->gro_cells, dev); + for (h = 0; h < FDB_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vxlan->fdb_head[h]); } @@ -2403,6 +2560,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, + [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, @@ -2482,6 +2640,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, udp_conf.family = AF_INET6; udp_conf.use_udp6_rx_checksums = !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); + udp_conf.ipv6_v6only = 1; } else { udp_conf.family = AF_INET; } @@ -2497,15 +2656,13 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, } /* Create new listen socket if needed */ -static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, - u32 flags) +static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, + __be16 port, u32 flags) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; struct socket *sock; unsigned int h; - bool ipv6 = !!(flags & VXLAN_F_IPV6); struct udp_tunnel_sock_cfg tunnel_cfg; vs = kzalloc(sizeof(*vs), GFP_KERNEL); @@ -2527,8 +2684,6 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, vs->sock = sock; atomic_set(&vs->refcnt, 1); - vs->rcv = rcv; - vs->data = data; vs->flags = (flags & VXLAN_F_RCV_FLAGS); /* Initialize the vxlan udp offloads structure */ @@ -2552,79 +2707,89 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, return vs; } -struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, - bool no_share, u32 flags) +static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) { - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - struct vxlan_sock *vs; - bool ipv6 = flags & VXLAN_F_IPV6; + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + struct vxlan_sock *vs = NULL; - if (!no_share) { + if (!vxlan->cfg.no_share) { spin_lock(&vn->sock_lock); - vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, - flags); - if (vs && vs->rcv == rcv) { - if (!atomic_add_unless(&vs->refcnt, 1, 0)) - vs = ERR_PTR(-EBUSY); + vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, + vxlan->cfg.dst_port, vxlan->flags); + if (vs && !atomic_add_unless(&vs->refcnt, 1, 0)) { spin_unlock(&vn->sock_lock); - return vs; + return -EBUSY; } spin_unlock(&vn->sock_lock); } + if (!vs) + vs = vxlan_socket_create(vxlan->net, ipv6, + vxlan->cfg.dst_port, vxlan->flags); + if (IS_ERR(vs)) + return PTR_ERR(vs); +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6) + vxlan->vn6_sock = vs; + else +#endif + vxlan->vn4_sock = vs; + vxlan_vs_add_dev(vs, vxlan); + return 0; +} - return vxlan_socket_create(net, port, rcv, data, flags); +static int vxlan_sock_add(struct vxlan_dev *vxlan) +{ + bool ipv6 = vxlan->flags & VXLAN_F_IPV6; + bool metadata = vxlan->flags & VXLAN_F_COLLECT_METADATA; + int ret = 0; + + vxlan->vn4_sock = NULL; +#if IS_ENABLED(CONFIG_IPV6) + vxlan->vn6_sock = NULL; + if (ipv6 || metadata) + ret = __vxlan_sock_add(vxlan, true); +#endif + if (!ret && (!ipv6 || metadata)) + ret = __vxlan_sock_add(vxlan, false); + if (ret < 0) + vxlan_sock_release(vxlan); + return ret; } -EXPORT_SYMBOL_GPL(vxlan_sock_add); -static int vxlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) +static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, + struct vxlan_config *conf) { struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); - struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_dev *vxlan = netdev_priv(dev), *tmp; struct vxlan_rdst *dst = &vxlan->default_dst; - __u32 vni; + unsigned short needed_headroom = ETH_HLEN; int err; bool use_ipv6 = false; - - if (!data[IFLA_VXLAN_ID]) - return -EINVAL; + __be16 default_port = vxlan->cfg.dst_port; vxlan->net = src_net; - vni = nla_get_u32(data[IFLA_VXLAN_ID]); - dst->remote_vni = vni; + dst->remote_vni = conf->vni; - /* Unless IPv6 is explicitly requested, assume IPv4 */ - dst->remote_ip.sa.sa_family = AF_INET; - if (data[IFLA_VXLAN_GROUP]) { - dst->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]); - } else if (data[IFLA_VXLAN_GROUP6]) { - if (!IS_ENABLED(CONFIG_IPV6)) - return -EPFNOSUPPORT; + memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip)); - dst->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]); - dst->remote_ip.sa.sa_family = AF_INET6; - use_ipv6 = true; - } + /* Unless IPv6 is explicitly requested, assume IPv4 */ + if (!dst->remote_ip.sa.sa_family) + dst->remote_ip.sa.sa_family = AF_INET; - if (data[IFLA_VXLAN_LOCAL]) { - vxlan->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]); - vxlan->saddr.sa.sa_family = AF_INET; - } else if (data[IFLA_VXLAN_LOCAL6]) { + if (dst->remote_ip.sa.sa_family == AF_INET6 || + vxlan->cfg.saddr.sa.sa_family == AF_INET6) { if (!IS_ENABLED(CONFIG_IPV6)) return -EPFNOSUPPORT; - - /* TODO: respect scope id */ - vxlan->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]); - vxlan->saddr.sa.sa_family = AF_INET6; use_ipv6 = true; + vxlan->flags |= VXLAN_F_IPV6; } - if (data[IFLA_VXLAN_LINK] && - (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) { + if (conf->remote_ifindex) { struct net_device *lowerdev - = __dev_get_by_index(src_net, dst->remote_ifindex); + = __dev_get_by_index(src_net, conf->remote_ifindex); + + dst->remote_ifindex = conf->remote_ifindex; if (!lowerdev) { pr_info("ifindex %d does not exist\n", dst->remote_ifindex); @@ -2638,113 +2803,208 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, pr_info("IPv6 is disabled via sysctl\n"); return -EPERM; } - vxlan->flags |= VXLAN_F_IPV6; } #endif - if (!tb[IFLA_MTU]) + if (!conf->mtu) dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); - dev->needed_headroom = lowerdev->hard_header_len + - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); - } else if (use_ipv6) - vxlan->flags |= VXLAN_F_IPV6; + needed_headroom = lowerdev->hard_header_len; + } + + if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA) + needed_headroom += VXLAN6_HEADROOM; + else + needed_headroom += VXLAN_HEADROOM; + dev->needed_headroom = needed_headroom; + + memcpy(&vxlan->cfg, conf, sizeof(*conf)); + if (!vxlan->cfg.dst_port) + vxlan->cfg.dst_port = default_port; + vxlan->flags |= conf->flags; + + if (!vxlan->cfg.age_interval) + vxlan->cfg.age_interval = FDB_AGE_DEFAULT; + + list_for_each_entry(tmp, &vn->vxlan_list, next) { + if (tmp->cfg.vni == conf->vni && + (tmp->default_dst.remote_ip.sa.sa_family == AF_INET6 || + tmp->cfg.saddr.sa.sa_family == AF_INET6) == use_ipv6 && + tmp->cfg.dst_port == vxlan->cfg.dst_port && + (tmp->flags & VXLAN_F_RCV_FLAGS) == + (vxlan->flags & VXLAN_F_RCV_FLAGS)) + return -EEXIST; + } + + dev->ethtool_ops = &vxlan_ethtool_ops; + + /* create an fdb entry for a valid default destination */ + if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { + err = vxlan_fdb_create(vxlan, all_zeros_mac, + &vxlan->default_dst.remote_ip, + NUD_REACHABLE|NUD_PERMANENT, + NLM_F_EXCL|NLM_F_CREATE, + vxlan->cfg.dst_port, + vxlan->default_dst.remote_vni, + vxlan->default_dst.remote_ifindex, + NTF_SELF); + if (err) + return err; + } + + err = register_netdevice(dev); + if (err) { + vxlan_fdb_delete_default(vxlan); + return err; + } + + list_add(&vxlan->next, &vn->vxlan_list); + + return 0; +} + +struct net_device *vxlan_dev_create(struct net *net, const char *name, + u8 name_assign_type, struct vxlan_config *conf) +{ + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev; + int err; + + memset(&tb, 0, sizeof(tb)); + + dev = rtnl_create_link(net, name, name_assign_type, + &vxlan_link_ops, tb); + if (IS_ERR(dev)) + return dev; + + err = vxlan_dev_configure(net, dev, conf); + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } + + return dev; +} +EXPORT_SYMBOL_GPL(vxlan_dev_create); + +static int vxlan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct vxlan_config conf; + int err; + + memset(&conf, 0, sizeof(conf)); + + if (data[IFLA_VXLAN_ID]) + conf.vni = nla_get_u32(data[IFLA_VXLAN_ID]); + + if (data[IFLA_VXLAN_GROUP]) { + conf.remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]); + } else if (data[IFLA_VXLAN_GROUP6]) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + + conf.remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]); + conf.remote_ip.sa.sa_family = AF_INET6; + } + + if (data[IFLA_VXLAN_LOCAL]) { + conf.saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]); + conf.saddr.sa.sa_family = AF_INET; + } else if (data[IFLA_VXLAN_LOCAL6]) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + + /* TODO: respect scope id */ + conf.saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]); + conf.saddr.sa.sa_family = AF_INET6; + } + + if (data[IFLA_VXLAN_LINK]) + conf.remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]); if (data[IFLA_VXLAN_TOS]) - vxlan->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); + conf.tos = nla_get_u8(data[IFLA_VXLAN_TOS]); if (data[IFLA_VXLAN_TTL]) - vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); + conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING])) - vxlan->flags |= VXLAN_F_LEARN; + conf.flags |= VXLAN_F_LEARN; if (data[IFLA_VXLAN_AGEING]) - vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); - else - vxlan->age_interval = FDB_AGE_DEFAULT; + conf.age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY])) - vxlan->flags |= VXLAN_F_PROXY; + conf.flags |= VXLAN_F_PROXY; if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC])) - vxlan->flags |= VXLAN_F_RSC; + conf.flags |= VXLAN_F_RSC; if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS])) - vxlan->flags |= VXLAN_F_L2MISS; + conf.flags |= VXLAN_F_L2MISS; if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS])) - vxlan->flags |= VXLAN_F_L3MISS; + conf.flags |= VXLAN_F_L3MISS; if (data[IFLA_VXLAN_LIMIT]) - vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); + conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); + + if (data[IFLA_VXLAN_COLLECT_METADATA] && + nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA])) + conf.flags |= VXLAN_F_COLLECT_METADATA; if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); - vxlan->port_min = ntohs(p->low); - vxlan->port_max = ntohs(p->high); + conf.port_min = ntohs(p->low); + conf.port_max = ntohs(p->high); } if (data[IFLA_VXLAN_PORT]) - vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); + conf.dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); if (data[IFLA_VXLAN_UDP_CSUM] && nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) - vxlan->flags |= VXLAN_F_UDP_CSUM; + conf.flags |= VXLAN_F_UDP_CSUM; if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] && nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX])) - vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_TX; + conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_TX; if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] && nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) - vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; + conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; if (data[IFLA_VXLAN_REMCSUM_TX] && nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX])) - vxlan->flags |= VXLAN_F_REMCSUM_TX; + conf.flags |= VXLAN_F_REMCSUM_TX; if (data[IFLA_VXLAN_REMCSUM_RX] && nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX])) - vxlan->flags |= VXLAN_F_REMCSUM_RX; + conf.flags |= VXLAN_F_REMCSUM_RX; if (data[IFLA_VXLAN_GBP]) - vxlan->flags |= VXLAN_F_GBP; + conf.flags |= VXLAN_F_GBP; if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) - vxlan->flags |= VXLAN_F_REMCSUM_NOPARTIAL; - - if (vxlan_find_vni(src_net, vni, use_ipv6 ? AF_INET6 : AF_INET, - vxlan->dst_port, vxlan->flags)) { - pr_info("duplicate VNI %u\n", vni); - return -EEXIST; - } + conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL; - dev->ethtool_ops = &vxlan_ethtool_ops; + err = vxlan_dev_configure(src_net, dev, &conf); + switch (err) { + case -ENODEV: + pr_info("ifindex %d does not exist\n", conf.remote_ifindex); + break; - /* create an fdb entry for a valid default destination */ - if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { - err = vxlan_fdb_create(vxlan, all_zeros_mac, - &vxlan->default_dst.remote_ip, - NUD_REACHABLE|NUD_PERMANENT, - NLM_F_EXCL|NLM_F_CREATE, - vxlan->dst_port, - vxlan->default_dst.remote_vni, - vxlan->default_dst.remote_ifindex, - NTF_SELF); - if (err) - return err; - } + case -EPERM: + pr_info("IPv6 is disabled via sysctl\n"); + break; - err = register_netdevice(dev); - if (err) { - vxlan_fdb_delete_default(vxlan); - return err; + case -EEXIST: + pr_info("duplicate VNI %u\n", conf.vni); + break; } - list_add(&vxlan->next, &vn->vxlan_list); - - return 0; + return err; } static void vxlan_dellink(struct net_device *dev, struct list_head *head) @@ -2757,6 +3017,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) hlist_del_rcu(&vxlan->hlist); spin_unlock(&vn->sock_lock); + gro_cells_destroy(&vxlan->gro_cells); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); } @@ -2775,6 +3036,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + @@ -2792,8 +3054,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) const struct vxlan_dev *vxlan = netdev_priv(dev); const struct vxlan_rdst *dst = &vxlan->default_dst; struct ifla_vxlan_port_range ports = { - .low = htons(vxlan->port_min), - .high = htons(vxlan->port_max), + .low = htons(vxlan->cfg.port_min), + .high = htons(vxlan->cfg.port_max), }; if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni)) @@ -2816,22 +3078,22 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) goto nla_put_failure; - if (!vxlan_addr_any(&vxlan->saddr)) { - if (vxlan->saddr.sa.sa_family == AF_INET) { + if (!vxlan_addr_any(&vxlan->cfg.saddr)) { + if (vxlan->cfg.saddr.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL, - vxlan->saddr.sin.sin_addr.s_addr)) + vxlan->cfg.saddr.sin.sin_addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6, - &vxlan->saddr.sin6.sin6_addr)) + &vxlan->cfg.saddr.sin6.sin6_addr)) goto nla_put_failure; #endif } } - if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) || - nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) || + if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || + nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || nla_put_u8(skb, IFLA_VXLAN_LEARNING, !!(vxlan->flags & VXLAN_F_LEARN)) || nla_put_u8(skb, IFLA_VXLAN_PROXY, @@ -2841,9 +3103,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->flags & VXLAN_F_L3MISS)) || - nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || - nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) || - nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) || + nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, + !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) || + nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || + nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || + nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, !!(vxlan->flags & VXLAN_F_UDP_CSUM)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, @@ -2962,8 +3226,10 @@ static void __net_exit vxlan_exit_net(struct net *net) /* If vxlan->dev is in the same netns, it has already been added * to the list by the previous loop. */ - if (!net_eq(dev_net(vxlan->dev), net)) + if (!net_eq(dev_net(vxlan->dev), net)) { + gro_cells_destroy(&vxlan->gro_cells); unregister_netdevice_queue(vxlan->dev, &list); + } } unregister_netdevice_many(&list); |