diff options
author | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-11 10:41:07 +0300 |
---|---|---|
committer | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-13 08:17:18 +0300 |
commit | e09b41010ba33a20a87472ee821fa407a5b8da36 (patch) | |
tree | d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/net/openvswitch | |
parent | f93b97fd65072de626c074dbe099a1fff05ce060 (diff) |
These changes are the raw update to linux-4.4.6-rt14. The kernel sources
are taken from kernel.org, and the rt patch from the rt wiki download page.
During the rebasing, the following patch collided:
Force tick interrupt and get rid of softirq magic (I70131fb85).
The collisions have been removed because the patch's logic was already
present in the source.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/net/openvswitch')
22 files changed, 1972 insertions, 1319 deletions
diff --git a/kernel/net/openvswitch/Kconfig b/kernel/net/openvswitch/Kconfig index ed6b0f8dd..d143aa9f6 100644 --- a/kernel/net/openvswitch/Kconfig +++ b/kernel/net/openvswitch/Kconfig @@ -5,6 +5,8 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET + depends on !NF_CONNTRACK || \ + (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) select LIBCRC32C select MPLS select NET_MPLS_GSO @@ -34,7 +36,7 @@ config OPENVSWITCH config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" depends on OPENVSWITCH - depends on NET_IPGRE_DEMUX + depends on NET_IPGRE default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create GRE diff --git a/kernel/net/openvswitch/Makefile b/kernel/net/openvswitch/Makefile index 91b947841..60f809085 100644 --- a/kernel/net/openvswitch/Makefile +++ b/kernel/net/openvswitch/Makefile @@ -15,6 +15,10 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +ifneq ($(CONFIG_NF_CONNTRACK),) +openvswitch-y += conntrack.o +endif + +obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o -obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/kernel/net/openvswitch/actions.c b/kernel/net/openvswitch/actions.c index b491c1c29..c88d0f2d3 100644 --- a/kernel/net/openvswitch/actions.c +++ b/kernel/net/openvswitch/actions.c @@ -22,6 +22,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/openvswitch.h> +#include <linux/netfilter_ipv6.h> #include <linux/sctp.h> #include <linux/tcp.h> #include <linux/udp.h> @@ -29,8 +30,10 @@ #include <linux/if_arp.h> #include <linux/if_vlan.h> +#include <net/dst.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/ip6_fib.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/mpls.h> @@ -38,6 +41,7 @@ #include "datapath.h" #include "flow.h" +#include "conntrack.h" #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff 
*skb, @@ -52,6 +56,20 @@ struct deferred_action { struct sw_flow_key pkt_key; }; +#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) +struct ovs_frag_data { + unsigned long dst; + struct vport *vport; + struct ovs_skb_cb cb; + __be16 inner_protocol; + __u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 l2_data[MAX_L2_LEN]; +}; + +static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); + #define DEFERRED_ACTION_FIFO_SIZE 10 struct action_fifo { int head; @@ -185,10 +203,6 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, return 0; } -/* 'KEY' must not have any bits set outside of the 'MASK' */ -#define MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) -#define SET_MASKED(OLD, KEY, MASK) ((OLD) = MASKED(OLD, KEY, MASK)) - static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, const __be32 *mpls_lse, const __be32 *mask) { @@ -201,7 +215,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, return err; stack = (__be32 *)skb_mpls_header(skb); - lse = MASKED(*stack, *mpls_lse, *mask); + lse = OVS_MASKED(*stack, *mpls_lse, *mask); if (skb->ip_summed == CHECKSUM_COMPLETE) { __be32 diff[] = { ~(*stack), lse }; @@ -244,9 +258,9 @@ static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_) const u16 *src = (const u16 *)src_; const u16 *mask = (const u16 *)mask_; - SET_MASKED(dst[0], src[0], mask[0]); - SET_MASKED(dst[1], src[1], mask[1]); - SET_MASKED(dst[2], src[2], mask[2]); + OVS_SET_MASKED(dst[0], src[0], mask[0]); + OVS_SET_MASKED(dst[1], src[1], mask[1]); + OVS_SET_MASKED(dst[2], src[2], mask[2]); } static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, @@ -273,28 +287,36 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, - __be32 *addr, __be32 new_addr) +static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, + __be32 addr, __be32 
new_addr) { int transport_len = skb->len - skb_transport_offset(skb); + if (nh->frag_off & htons(IP_OFFSET)) + return; + if (nh->protocol == IPPROTO_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, - *addr, new_addr, 1); + addr, new_addr, true); } else if (nh->protocol == IPPROTO_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&uh->check, skb, - *addr, new_addr, 1); + addr, new_addr, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } } } +} +static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, + __be32 *addr, __be32 new_addr) +{ + update_ip_l4_checksum(skb, nh, *addr, new_addr); csum_replace4(&nh->check, *addr, new_addr); skb_clear_hash(skb); *addr = new_addr; @@ -308,14 +330,14 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, if (l4_proto == NEXTHDR_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, - addr, new_addr, 1); + addr, new_addr, true); } else if (l4_proto == NEXTHDR_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace16(&uh->check, skb, - addr, new_addr, 1); + addr, new_addr, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } @@ -323,17 +345,17 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, } else if (l4_proto == NEXTHDR_ICMP) { if (likely(transport_len >= sizeof(struct icmp6hdr))) inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum, - skb, addr, new_addr, 1); + skb, addr, new_addr, true); } } static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4], const __be32 mask[4], __be32 masked[4]) { - masked[0] = MASKED(old[0], addr[0], mask[0]); - masked[1] = MASKED(old[1], addr[1], mask[1]); - masked[2] = MASKED(old[2], 
addr[2], mask[2]); - masked[3] = MASKED(old[3], addr[3], mask[3]); + masked[0] = OVS_MASKED(old[0], addr[0], mask[0]); + masked[1] = OVS_MASKED(old[1], addr[1], mask[1]); + masked[2] = OVS_MASKED(old[2], addr[2], mask[2]); + masked[3] = OVS_MASKED(old[3], addr[3], mask[3]); } static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, @@ -350,15 +372,15 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) { /* Bits 21-24 are always unmasked, so this retains their values. */ - SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); - SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); - SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); + OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); + OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); + OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); } static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, u8 mask) { - new_ttl = MASKED(nh->ttl, new_ttl, mask); + new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask); csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); nh->ttl = new_ttl; @@ -384,7 +406,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, * makes sense to check if the value actually changed. 
*/ if (mask->ipv4_src) { - new_addr = MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); + new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); if (unlikely(new_addr != nh->saddr)) { set_ip_addr(skb, nh, &nh->saddr, new_addr); @@ -392,7 +414,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, } } if (mask->ipv4_dst) { - new_addr = MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); + new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); if (unlikely(new_addr != nh->daddr)) { set_ip_addr(skb, nh, &nh->daddr, new_addr); @@ -480,7 +502,8 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); } if (mask->ipv6_hlimit) { - SET_MASKED(nh->hop_limit, key->ipv6_hlimit, mask->ipv6_hlimit); + OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, + mask->ipv6_hlimit); flow_key->ip.ttl = nh->hop_limit; } return 0; @@ -490,7 +513,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) { - inet_proto_csum_replace2(check, skb, *port, new_port, 0); + inet_proto_csum_replace2(check, skb, *port, new_port, false); *port = new_port; } @@ -509,8 +532,8 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, uh = udp_hdr(skb); /* Either of the masks is non-zero, so do not bother checking them. 
*/ - src = MASKED(uh->source, key->udp_src, mask->udp_src); - dst = MASKED(uh->dest, key->udp_dst, mask->udp_dst); + src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src); + dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst); if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { if (likely(src != uh->source)) { @@ -550,12 +573,12 @@ static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key, return err; th = tcp_hdr(skb); - src = MASKED(th->source, key->tcp_src, mask->tcp_src); + src = OVS_MASKED(th->source, key->tcp_src, mask->tcp_src); if (likely(src != th->source)) { set_tp_port(skb, &th->source, src, &th->check); flow_key->tp.src = src; } - dst = MASKED(th->dest, key->tcp_dst, mask->tcp_dst); + dst = OVS_MASKED(th->dest, key->tcp_dst, mask->tcp_dst); if (likely(dst != th->dest)) { set_tp_port(skb, &th->dest, dst, &th->check); flow_key->tp.dst = dst; @@ -582,8 +605,8 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, old_csum = sh->checksum; old_correct_csum = sctp_compute_cksum(skb, sctphoff); - sh->source = MASKED(sh->source, key->sctp_src, mask->sctp_src); - sh->dest = MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); + sh->source = OVS_MASKED(sh->source, key->sctp_src, mask->sctp_src); + sh->dest = OVS_MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); new_csum = sctp_compute_cksum(skb, sctphoff); @@ -597,28 +620,162 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); + struct vport *vport = data->vport; + + if (skb_cow_head(skb, data->l2_len) < 0) { + kfree_skb(skb); + return -ENOMEM; + } + + __skb_dst_copy(skb, data->dst); + *OVS_CB(skb) = data->cb; + skb->inner_protocol = data->inner_protocol; + skb->vlan_tci = data->vlan_tci; + skb->vlan_proto = 
data->vlan_proto; + + /* Reconstruct the MAC header. */ + skb_push(skb, data->l2_len); + memcpy(skb->data, &data->l2_data, data->l2_len); + ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_reset_mac_header(skb); + + ovs_vport_send(vport, skb); + return 0; +} + +static unsigned int +ovs_dst_get_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops ovs_dst_ops = { + .family = AF_UNSPEC, + .mtu = ovs_dst_get_mtu, +}; + +/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is + * ovs_vport_output(), which is called once per fragmented packet. + */ +static void prepare_frag(struct vport *vport, struct sk_buff *skb) +{ + unsigned int hlen = skb_network_offset(skb); + struct ovs_frag_data *data; + + data = this_cpu_ptr(&ovs_frag_data_storage); + data->dst = skb->_skb_refdst; + data->vport = vport; + data->cb = *OVS_CB(skb); + data->inner_protocol = skb->inner_protocol; + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + data->l2_len = hlen; + memcpy(&data->l2_data, skb->data, hlen); + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + skb_pull(skb, hlen); +} + +static void ovs_fragment(struct net *net, struct vport *vport, + struct sk_buff *skb, u16 mru, __be16 ethertype) +{ + if (skb_network_offset(skb) > MAX_L2_LEN) { + OVS_NLERR(1, "L2 header too long to fragment"); + goto err; + } + + if (ethertype == htons(ETH_P_IP)) { + struct dst_entry ovs_dst; + unsigned long orig_dst; + + prepare_frag(vport, skb); + dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_dst); + IPCB(skb)->frag_max_size = mru; + + ip_do_fragment(net, skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else if (ethertype == htons(ETH_P_IPV6)) { + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + unsigned long orig_dst; + struct rt6_info ovs_rt; + + if (!v6ops) { + goto err; + } + + 
prepare_frag(vport, skb); + memset(&ovs_rt, 0, sizeof(ovs_rt)); + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_rt.dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_rt.dst); + IP6CB(skb)->frag_max_size = mru; + + v6ops->fragment(net, skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else { + WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", + ovs_vport_name(vport), ntohs(ethertype), mru, + vport->dev->mtu); + goto err; + } + + return; +err: + kfree_skb(skb); +} + +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, + struct sw_flow_key *key) { struct vport *vport = ovs_vport_rcu(dp, out_port); - if (likely(vport)) - ovs_vport_send(vport, skb); - else + if (likely(vport)) { + u16 mru = OVS_CB(skb)->mru; + + if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { + ovs_vport_send(vport, skb); + } else if (mru <= vport->dev->mtu) { + struct net *net = read_pnet(&dp->net); + __be16 ethertype = key->eth.type; + + if (!is_flow_key_valid(key)) { + if (eth_p_mpls(skb->protocol)) + ethertype = skb->inner_protocol; + else + ethertype = vlan_get_protocol(skb); + } + + ovs_fragment(net, vport, skb, mru, ethertype); + } else { + kfree_skb(skb); + } + } else { kfree_skb(skb); + } } static int output_userspace(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len) { - struct ovs_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; + memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.userdata = NULL; - upcall.portid = 0; - upcall.egress_tun_info = NULL; + upcall.mru = OVS_CB(skb)->mru; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -639,11 +796,18 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, if 
(vport) { int err; - err = ovs_vport_get_egress_tun_info(vport, skb, - &info); + err = dev_fill_metadata_dst(vport->dev, skb); if (!err) - upcall.egress_tun_info = &info; + upcall.egress_tun_info = skb_tunnel_info(skb); } + + break; + } + + case OVS_USERSPACE_ATTR_ACTIONS: { + /* Include actions. */ + upcall.actions = actions; + upcall.actions_len = actions_len; break; } @@ -654,7 +818,8 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, } static int sample(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len) { const struct nlattr *acts_list = NULL; const struct nlattr *a; @@ -662,9 +827,12 @@ static int sample(struct datapath *dp, struct sk_buff *skb, for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { + u32 probability; + switch (nla_type(a)) { case OVS_SAMPLE_ATTR_PROBABILITY: - if (prandom_u32() >= nla_get_u32(a)) + probability = nla_get_u32(a); + if (!probability || prandom_u32() > probability) return 0; break; @@ -688,7 +856,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, */ if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && nla_is_last(a, rem))) - return output_userspace(dp, skb, key, a); + return output_userspace(dp, skb, key, a, actions, actions_len); skb = skb_clone(skb, GFP_ATOMIC); if (!skb) @@ -726,7 +894,11 @@ static int execute_set_action(struct sk_buff *skb, { /* Only tunnel set execution is supported without a mask. 
*/ if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { - OVS_CB(skb)->egress_tun_info = nla_data(a); + struct ovs_tunnel_info *tun = nla_data(a); + + skb_dst_drop(skb); + dst_hold((struct dst_entry *)tun->tun_dst); + skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); return 0; } @@ -744,12 +916,13 @@ static int execute_masked_set_action(struct sk_buff *skb, switch (nla_type(a)) { case OVS_KEY_ATTR_PRIORITY: - SET_MASKED(skb->priority, nla_get_u32(a), *get_mask(a, u32 *)); + OVS_SET_MASKED(skb->priority, nla_get_u32(a), + *get_mask(a, u32 *)); flow_key->phy.priority = skb->priority; break; case OVS_KEY_ATTR_SKB_MARK: - SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); + OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); flow_key->phy.skb_mark = skb->mark; break; @@ -792,6 +965,13 @@ static int execute_masked_set_action(struct sk_buff *skb, err = set_mpls(skb, flow_key, nla_data(a), get_mask(a, __be32 *)); break; + + case OVS_KEY_ATTR_CT_STATE: + case OVS_KEY_ATTR_CT_ZONE: + case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABELS: + err = -EINVAL; + break; } return err; @@ -861,7 +1041,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); if (out_skb) - do_output(dp, out_skb, prev_port); + do_output(dp, out_skb, prev_port, key); prev_port = -1; } @@ -872,7 +1052,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, key, a); + output_userspace(dp, skb, key, a, attr, len); break; case OVS_ACTION_ATTR_HASH: @@ -916,7 +1096,22 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_SAMPLE: - err = sample(dp, skb, key, a); + err = sample(dp, skb, key, a, attr, len); + break; + + case OVS_ACTION_ATTR_CT: + if (!is_flow_key_valid(key)) { + err = ovs_flow_key_update(skb, key); + if (err) + return err; + } + + err = 
ovs_ct_execute(ovs_dp_get_net(dp), skb, key, + nla_data(a)); + + /* Hide stolen IP fragments from user space. */ + if (err) + return err == -EINPROGRESS ? 0 : err; break; } @@ -927,7 +1122,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } if (prev_port != -1) - do_output(dp, skb, prev_port); + do_output(dp, skb, prev_port, key); else consume_skb(skb); @@ -969,7 +1164,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, int err; this_cpu_inc(exec_actions_level); - OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/kernel/net/openvswitch/conntrack.c b/kernel/net/openvswitch/conntrack.c new file mode 100644 index 000000000..e004067ec --- /dev/null +++ b/kernel/net/openvswitch/conntrack.c @@ -0,0 +1,790 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/openvswitch.h> +#include <net/ip.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> + +#include "datapath.h" +#include "conntrack.h" +#include "flow.h" +#include "flow_netlink.h" + +struct ovs_ct_len_tbl { + size_t maxlen; + size_t minlen; +}; + +/* Metadata mark for masked write to conntrack mark */ +struct md_mark { + u32 value; + u32 mask; +}; + +/* Metadata label for masked write to conntrack label. 
*/ +struct md_labels { + struct ovs_key_ct_labels value; + struct ovs_key_ct_labels mask; +}; + +/* Conntrack action context for execution. */ +struct ovs_conntrack_info { + struct nf_conntrack_helper *helper; + struct nf_conntrack_zone zone; + struct nf_conn *ct; + u8 commit : 1; + u16 family; + struct md_mark mark; + struct md_labels labels; +}; + +static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); + +static u16 key_to_nfproto(const struct sw_flow_key *key) +{ + switch (ntohs(key->eth.type)) { + case ETH_P_IP: + return NFPROTO_IPV4; + case ETH_P_IPV6: + return NFPROTO_IPV6; + default: + return NFPROTO_UNSPEC; + } +} + +/* Map SKB connection state into the values used by flow definition. */ +static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) +{ + u8 ct_state = OVS_CS_F_TRACKED; + + switch (ctinfo) { + case IP_CT_ESTABLISHED_REPLY: + case IP_CT_RELATED_REPLY: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_REPLY_DIR; + break; + default: + break; + } + + switch (ctinfo) { + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + ct_state |= OVS_CS_F_ESTABLISHED; + break; + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + ct_state |= OVS_CS_F_RELATED; + break; + case IP_CT_NEW: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_NEW; + break; + default: + break; + } + + return ct_state; +} + +static u32 ovs_ct_get_mark(const struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + return ct ? ct->mark : 0; +#else + return 0; +#endif +} + +static void ovs_ct_get_labels(const struct nf_conn *ct, + struct ovs_key_ct_labels *labels) +{ + struct nf_conn_labels *cl = ct ? 
nf_ct_labels_find(ct) : NULL; + + if (cl) { + size_t len = cl->words * sizeof(long); + + if (len > OVS_CT_LABELS_LEN) + len = OVS_CT_LABELS_LEN; + else if (len < OVS_CT_LABELS_LEN) + memset(labels, 0, OVS_CT_LABELS_LEN); + memcpy(labels, cl->bits, len); + } else { + memset(labels, 0, OVS_CT_LABELS_LEN); + } +} + +static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, + const struct nf_conntrack_zone *zone, + const struct nf_conn *ct) +{ + key->ct.state = state; + key->ct.zone = zone->id; + key->ct.mark = ovs_ct_get_mark(ct); + ovs_ct_get_labels(ct, &key->ct.labels); +} + +/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has + * previously sent the packet to conntrack via the ct action. + */ +static void ovs_ct_update_key(const struct sk_buff *skb, + const struct ovs_conntrack_info *info, + struct sw_flow_key *key, bool post_ct) +{ + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u8 state = 0; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + state = ovs_ct_get_state(ctinfo); + if (!nf_ct_is_confirmed(ct)) + state |= OVS_CS_F_NEW; + if (ct->master) + state |= OVS_CS_F_RELATED; + zone = nf_ct_zone(ct); + } else if (post_ct) { + state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; + if (info) + zone = &info->zone; + } + __ovs_ct_update_key(key, state, zone, ct); +} + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) +{ + ovs_ct_update_key(skb, NULL, key, false); +} + +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) +{ + if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + nla_put(skb, OVS_KEY_ATTR_CT_LABELS, 
sizeof(key->ct.labels), + &key->ct.labels)) + return -EMSGSIZE; + + return 0; +} + +static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, + u32 ct_mark, u32 mask) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u32 new_mark; + + + /* The connection could be invalid, in which case set_mark is no-op. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + new_mark = ct_mark | (ct->mark & ~(mask)); + if (ct->mark != new_mark) { + ct->mark = new_mark; + nf_conntrack_event_cache(IPCT_MARK, ct); + key->ct.mark = new_mark; + } + + return 0; +#else + return -ENOTSUPP; +#endif +} + +static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_ct_labels *labels, + const struct ovs_key_ct_labels *mask) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn_labels *cl; + struct nf_conn *ct; + int err; + + /* The connection could be invalid, in which case set_label is no-op.*/ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + cl = nf_ct_labels_find(ct); + if (!cl) { + nf_ct_labels_ext_add(ct); + cl = nf_ct_labels_find(ct); + } + if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN) + return -ENOSPC; + + err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, + OVS_CT_LABELS_LEN / sizeof(u32)); + if (err) + return err; + + ovs_ct_get_labels(ct, &key->ct.labels); + return 0; +} + +/* 'skb' should already be pulled to nh_ofs. 
*/ +static int ovs_ct_helper(struct sk_buff *skb, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + int ofs; + + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + protoff = ofs; + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + return helper->help(skb, protoff, ct, ctinfo); +} + +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. 
+ */ +static int handle_fragments(struct net *net, struct sw_flow_key *key, + u16 zone, struct sk_buff *skb) +{ + struct ovs_skb_cb ovs_cb = *OVS_CB(skb); + + if (key->eth.type == htons(ETH_P_IP)) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + int err; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + err = ip_defrag(net, skb, user); + if (err) + return err; + + ovs_cb.mru = IPCB(skb)->frag_max_size; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + } else if (key->eth.type == htons(ETH_P_IPV6)) { + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + struct sk_buff *reasm; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + reasm = nf_ct_frag6_gather(net, skb, user); + if (!reasm) + return -EINPROGRESS; + + if (skb == reasm) { + kfree_skb(skb); + return -EINVAL; + } + + /* Don't free 'skb' even though it is one of the original + * fragments, as we're going to morph it into the head. + */ + skb_get(skb); + nf_ct_frag6_consume_orig(reasm); + + key->ip.proto = ipv6_hdr(reasm)->nexthdr; + skb_morph(skb, reasm); + skb->next = reasm->next; + consume_skb(reasm); + ovs_cb.mru = IP6CB(skb)->frag_max_size; +#endif + } else { + kfree_skb(skb); + return -EPFNOSUPPORT; + } + + key->ip.frag = OVS_FRAG_TYPE_NONE; + skb_clear_hash(skb); + skb->ignore_df = 1; + *OVS_CB(skb) = ovs_cb; + + return 0; +} + +static struct nf_conntrack_expect * +ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, + u16 proto, const struct sk_buff *skb) +{ + struct nf_conntrack_tuple tuple; + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) + return NULL; + return __nf_ct_expect_find(net, zone, &tuple); +} + +/* Determine whether skb->nfct is equal to the result of conntrack lookup. 
*/ +static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, + const struct ovs_conntrack_info *info) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + if (!net_eq(net, read_pnet(&ct->ct_net))) + return false; + if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) + return false; + if (info->helper) { + struct nf_conn_help *help; + + help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); + if (help && rcu_access_pointer(help->helper) != info->helper) + return false; + } + + return true; +} + +static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + /* If we are recirculating packets to match on conntrack fields and + * committing with a separate conntrack action, then we don't need to + * actually run the packet through conntrack twice unless it's for a + * different zone. + */ + if (!skb_nfct_cached(net, skb, info)) { + struct nf_conn *tmpl = info->ct; + + /* Associate skb with specified zone. */ + if (tmpl) { + if (skb->nfct) + nf_conntrack_put(skb->nfct); + nf_conntrack_get(&tmpl->ct_general); + skb->nfct = &tmpl->ct_general; + skb->nfctinfo = IP_CT_NEW; + } + + if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, + skb) != NF_ACCEPT) + return -ENOENT; + + if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + WARN_ONCE(1, "helper rejected packet"); + return -EINVAL; + } + } + + ovs_ct_update_key(skb, info, key, true); + + return 0; +} + +/* Lookup connection and read fields into key. 
*/ +static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nf_conntrack_expect *exp; + + exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); + if (exp) { + u8 state; + + state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; + __ovs_ct_update_key(key, state, &info->zone, exp->master); + } else { + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + } + + return 0; +} + +/* Lookup connection and confirm if unconfirmed. */ +static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + u8 state; + int err; + + state = key->ct.state; + if (key->ct.zone == info->zone.id && + ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) { + /* Previous lookup has shown that this connection is already + * tracked and committed. Skip committing. + */ + return 0; + } + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + if (nf_conntrack_confirm(skb) != NF_ACCEPT) + return -EINVAL; + + return 0; +} + +static bool labels_nonzero(const struct ovs_key_ct_labels *labels) +{ + size_t i; + + for (i = 0; i < sizeof(*labels); i++) + if (labels->ct_labels[i]) + return true; + + return false; +} + +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. + */ +int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + int nh_ofs; + int err; + + /* The conntrack module expects to be working at L3. 
*/
+	nh_ofs = skb_network_offset(skb);
+	skb_pull(skb, nh_ofs);
+
+	if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
+		err = handle_fragments(net, key, info->zone.id, skb);
+		if (err)
+			return err;
+	}
+
+	if (info->commit)
+		err = ovs_ct_commit(net, key, info, skb);
+	else
+		err = ovs_ct_lookup(net, key, info, skb);
+	if (err)
+		goto err;
+
+	if (info->mark.mask) {
+		err = ovs_ct_set_mark(skb, key, info->mark.value,
+				      info->mark.mask);
+		if (err)
+			goto err;
+	}
+	if (labels_nonzero(&info->labels.mask))
+		err = ovs_ct_set_labels(skb, key, &info->labels.value,
+					&info->labels.mask);
+err:
+	skb_push(skb, nh_ofs);
+	if (err)
+		kfree_skb(skb);
+	return err;
+}
+
+static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
+			     const struct sw_flow_key *key, bool log)
+{
+	struct nf_conntrack_helper *helper;
+	struct nf_conn_help *help;
+
+	helper = nf_conntrack_helper_try_module_get(name, info->family,
+						    key->ip.proto);
+	if (!helper) {
+		OVS_NLERR(log, "Unknown helper \"%s\"", name);
+		return -EINVAL;
+	}
+
+	help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
+	if (!help) {
+		module_put(helper->me);
+		return -ENOMEM;
+	}
+
+	rcu_assign_pointer(help->helper, helper);
+	info->helper = helper;
+	return 0;
+}
+
+static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
+	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0, .maxlen = 0 },
+	[OVS_CT_ATTR_ZONE]	= { .minlen = sizeof(u16),
+				    .maxlen = sizeof(u16) },
+	[OVS_CT_ATTR_MARK]	= { .minlen = sizeof(struct md_mark),
+				    .maxlen = sizeof(struct md_mark) },
+	[OVS_CT_ATTR_LABELS]	= { .minlen = sizeof(struct md_labels),
+				    .maxlen = sizeof(struct md_labels) },
+	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,
+				    .maxlen = NF_CT_HELPER_NAME_LEN }
+};
+
+/* Parse the nested conntrack attributes in 'attr' into 'info'.
+ *
+ * Each attribute's type is checked against OVS_CT_ATTR_MAX and its length
+ * against the bounds in ovs_ct_attr_lens[] before it is interpreted.  If a
+ * helper attribute is present, '*helper' is pointed at the NUL-terminated
+ * name inside the attribute payload (the string is not copied).
+ *
+ * Returns 0 on success, or -EINVAL for an unknown attribute type, an
+ * out-of-range length, an all-zero mark or labels mask, a helper name
+ * without a terminating NUL, or trailing bytes after the last attribute.
+ */
+static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
+		    const char **helper, bool log)
+{
+	struct nlattr *a;
+	int rem;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+		int maxlen;
+		int minlen;
+
+		/* Validate 'type' before using it as an index:
+		 * ovs_ct_attr_lens[] has only OVS_CT_ATTR_MAX + 1 entries
+		 * and 'type' comes from the netlink message.
+		 */
+		if (type > OVS_CT_ATTR_MAX) {
+			OVS_NLERR(log,
+				  "Unknown conntrack attr (type=%d, max=%d)",
+				  type, OVS_CT_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		maxlen = ovs_ct_attr_lens[type].maxlen;
+		minlen = ovs_ct_attr_lens[type].minlen;
+		if (nla_len(a) < minlen || nla_len(a) > maxlen) {
+			OVS_NLERR(log,
+				  "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)",
+				  type, nla_len(a), maxlen);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_CT_ATTR_COMMIT:
+			info->commit = true;
+			break;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+		case OVS_CT_ATTR_ZONE:
+			info->zone.id = nla_get_u16(a);
+			break;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+		case OVS_CT_ATTR_MARK: {
+			struct md_mark *mark = nla_data(a);
+
+			if (!mark->mask) {
+				OVS_NLERR(log, "ct_mark mask cannot be 0");
+				return -EINVAL;
+			}
+			info->mark = *mark;
+			break;
+		}
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+		case OVS_CT_ATTR_LABELS: {
+			struct md_labels *labels = nla_data(a);
+
+			if (!labels_nonzero(&labels->mask)) {
+				OVS_NLERR(log, "ct_labels mask cannot be 0");
+				return -EINVAL;
+			}
+			info->labels = *labels;
+			break;
+		}
+#endif
+		case OVS_CT_ATTR_HELPER:
+			*helper = nla_data(a);
+			/* The name must be NUL-terminated within the payload. */
+			if (!memchr(*helper, '\0', nla_len(a))) {
+				OVS_NLERR(log, "Invalid conntrack helper");
+				return -EINVAL;
+			}
+			break;
+		default:
+			OVS_NLERR(log, "Unknown conntrack attr (%d)",
+				  type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr)
+{
+	if (attr == OVS_KEY_ATTR_CT_STATE)
+		return true;
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+	    attr == OVS_KEY_ATTR_CT_ZONE)
+		return true;
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+	    attr == OVS_KEY_ATTR_CT_MARK)
+		return true;
+	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+	    attr == OVS_KEY_ATTR_CT_LABELS) {
+		struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+		return ovs_net->xt_label;
+	}
+
+	return false;
+}
+
+int ovs_ct_copy_action(struct net
*net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + struct ovs_conntrack_info ct_info; + const char *helper = NULL; + u16 family; + int err; + + family = key_to_nfproto(key); + if (family == NFPROTO_UNSPEC) { + OVS_NLERR(log, "ct family unspecified"); + return -EINVAL; + } + + memset(&ct_info, 0, sizeof(ct_info)); + ct_info.family = family; + + nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); + + err = parse_ct(attr, &ct_info, &helper, log); + if (err) + return err; + + /* Set up template for tracking connections in specific zones. */ + ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL); + if (!ct_info.ct) { + OVS_NLERR(log, "Failed to allocate conntrack template"); + return -ENOMEM; + } + + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); + + if (helper) { + err = ovs_ct_add_helper(&ct_info, helper, key, log); + if (err) + goto err_free_ct; + } + + err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, + sizeof(ct_info), log); + if (err) + goto err_free_ct; + + return 0; +err_free_ct: + __ovs_ct_free_action(&ct_info); + return err; +} + +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CT); + if (!start) + return -EMSGSIZE; + + if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask && + nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), + &ct_info->mark)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + labels_nonzero(&ct_info->labels.mask) && + nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels), + &ct_info->labels)) + return -EMSGSIZE; + if 
(ct_info->helper) { + if (nla_put_string(skb, OVS_CT_ATTR_HELPER, + ct_info->helper->name)) + return -EMSGSIZE; + } + + nla_nest_end(skb, start); + + return 0; +} + +void ovs_ct_free_action(const struct nlattr *a) +{ + struct ovs_conntrack_info *ct_info = nla_data(a); + + __ovs_ct_free_action(ct_info); +} + +static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) +{ + if (ct_info->helper) + module_put(ct_info->helper->me); + if (ct_info->ct) + nf_ct_put(ct_info->ct); +} + +void ovs_ct_init(struct net *net) +{ + unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (nf_connlabels_get(net, n_bits)) { + ovs_net->xt_label = false; + OVS_NLERR(true, "Failed to set connlabel length"); + } else { + ovs_net->xt_label = true; + } +} + +void ovs_ct_exit(struct net *net) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (ovs_net->xt_label) + nf_connlabels_put(net); +} diff --git a/kernel/net/openvswitch/conntrack.h b/kernel/net/openvswitch/conntrack.h new file mode 100644 index 000000000..a7544f405 --- /dev/null +++ b/kernel/net/openvswitch/conntrack.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#ifndef OVS_CONNTRACK_H +#define OVS_CONNTRACK_H 1 + +#include "flow.h" + +struct ovs_conntrack_info; +enum ovs_key_attr; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +void ovs_ct_init(struct net *); +void ovs_ct_exit(struct net *); +bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); +int ovs_ct_copy_action(struct net *, const struct nlattr *, + const struct sw_flow_key *, struct sw_flow_actions **, + bool log); +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); + +int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, + const struct ovs_conntrack_info *); + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); +void ovs_ct_free_action(const struct nlattr *a); + +#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ + OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ + OVS_CS_F_INVALID | OVS_CS_F_TRACKED) +#else +#include <linux/errno.h> + +static inline void ovs_ct_init(struct net *net) { } + +static inline void ovs_ct_exit(struct net *net) { } + +static inline bool ovs_ct_verify(struct net *net, int attr) +{ + return false; +} + +static inline int ovs_ct_copy_action(struct net *net, const struct nlattr *nla, + const struct sw_flow_key *key, + struct sw_flow_actions **acts, bool log) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_action_to_attr(const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + kfree_skb(skb); + return -ENOTSUPP; +} + +static inline void ovs_ct_fill_key(const struct sk_buff *skb, + struct sw_flow_key *key) +{ + key->ct.state = 0; + key->ct.zone = 0; + key->ct.mark = 0; + memset(&key->ct.labels, 0, sizeof(key->ct.labels)); +} + +static inline int ovs_ct_put_key(const struct sw_flow_key *key, + struct sk_buff 
*skb) +{ + return 0; +} + +static inline void ovs_ct_free_action(const struct nlattr *a) { } + +#define CT_SUPPORTED_MASK 0 +#endif /* CONFIG_NF_CONNTRACK */ +#endif /* ovs_conntrack.h */ diff --git a/kernel/net/openvswitch/datapath.c b/kernel/net/openvswitch/datapath.c index 27e14962b..deadfdab1 100644 --- a/kernel/net/openvswitch/datapath.c +++ b/kernel/net/openvswitch/datapath.c @@ -91,8 +91,7 @@ static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, static void ovs_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info) { - genl_notify(family, skb, genl_info_net(info), info->snd_portid, - 0, info->nlhdr, GFP_KERNEL); + genl_notify(family, skb, info, 0, GFP_KERNEL); } /** @@ -176,7 +175,7 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); - return vport->ops->get_name(vport); + return ovs_vport_name(vport); } static int get_dpifindex(const struct datapath *dp) @@ -188,7 +187,7 @@ static int get_dpifindex(const struct datapath *dp) local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) - ifindex = netdev_vport_priv(local)->dev->ifindex; + ifindex = local->dev->ifindex; else ifindex = 0; @@ -272,10 +271,10 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_upcall_info upcall; int error; + memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.userdata = NULL; upcall.portid = ovs_vport_find_upcall_portid(p, skb); - upcall.egress_tun_info = NULL; + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall); if (unlikely(error)) kfree_skb(skb); @@ -337,12 +336,10 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, unsigned short gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; - struct ovs_skb_cb ovs_cb; int err; - ovs_cb = *OVS_CB(skb); + 
BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); - *OVS_CB(skb) = ovs_cb; if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) @@ -360,7 +357,6 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, /* Queue all of the segments. */ skb = segs; do { - *OVS_CB(skb) = ovs_cb; if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; @@ -397,9 +393,27 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, if (upcall_info->egress_tun_info) size += nla_total_size(ovs_tun_key_attr_size()); + /* OVS_PACKET_ATTR_ACTIONS */ + if (upcall_info->actions_len) + size += nla_total_size(upcall_info->actions_len); + + /* OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) + size += nla_total_size(sizeof(upcall_info->mru)); + return size; } +static void pad_packet(struct datapath *dp, struct sk_buff *skb) +{ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(skb->len) - skb->len; + + if (plen > 0) + memset(skb_put(skb, plen), 0, plen); + } +} + static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info) @@ -472,12 +486,33 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (upcall_info->egress_tun_info) { nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); - err = ovs_nla_put_egress_tunnel_key(user_skb, - upcall_info->egress_tun_info); + err = ovs_nla_put_tunnel_info(user_skb, + upcall_info->egress_tun_info); BUG_ON(err); nla_nest_end(user_skb, nla); } + if (upcall_info->actions_len) { + nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS); + err = ovs_nla_put_actions(upcall_info->actions, + upcall_info->actions_len, + user_skb); + if (!err) + nla_nest_end(user_skb, nla); + else + nla_nest_cancel(user_skb, nla); + } + + /* Add OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) { + if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, + 
upcall_info->mru)) { + err = -ENOBUFS; + goto out; + } + pad_packet(dp, user_skb); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { @@ -491,12 +526,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, goto out; /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ - if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { - size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len; - - if (plen > 0) - memset(skb_put(user_skb, plen), 0, plen); - } + pad_packet(dp, user_skb); ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; @@ -513,6 +543,7 @@ out: static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; struct sk_buff *packet; @@ -521,6 +552,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct ethhdr *eth; struct vport *input_vport; + u16 mru = 0; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; @@ -545,34 +577,40 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) /* Normally, setting the skb 'protocol' field would be handled by a * call to eth_type_trans(), but it assumes there's a sending * device, which we may not have. */ - if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(eth->h_proto)) packet->protocol = eth->h_proto; else packet->protocol = htons(ETH_P_802_2); + /* Set packet's mru */ + if (a[OVS_PACKET_ATTR_MRU]) { + mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); + packet->ignore_df = 1; + } + OVS_CB(packet)->mru = mru; + /* Build an sw_flow for sending this packet. 
*/ flow = ovs_flow_alloc(); err = PTR_ERR(flow); if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, - &flow->key, log); + err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], + packet, &flow->key, log); if (err) goto err_flow_free; - err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], + err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts, log); if (err) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); - OVS_CB(packet)->egress_tun_info = NULL; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); - dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(net, ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; @@ -584,6 +622,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (!input_vport) goto err_unlock; + packet->dev = input_vport->dev; OVS_CB(packet)->input_vport = input_vport; sf_acts = rcu_dereference(flow->sf_acts); @@ -610,6 +649,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, }; static const struct genl_ops dp_packet_genl_ops[] = { @@ -699,7 +739,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, /* OVS_FLOW_ATTR_ACTIONS */ if (should_fill_actions(ufid_flags)) - len += nla_total_size(acts->actions_len); + len += nla_total_size(acts->orig_len); return len + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ @@ -866,6 +906,7 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header 
*ovs_header = info->userhdr; struct sw_flow *flow = NULL, *new_flow; @@ -901,7 +942,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) /* Extract key. */ ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; @@ -915,8 +956,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_kfree_flow; /* Validate actions. */ - error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - &acts, log); + error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], + &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); goto err_kfree_flow; @@ -930,7 +971,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; @@ -1004,7 +1045,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_unlock(); - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); ovs_flow_free(new_flow, false); } @@ -1016,7 +1057,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); err_kfree_flow: ovs_flow_free(new_flow, false); error: @@ -1024,7 +1065,8 @@ error: } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. 
*/ -static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, +static struct sw_flow_actions *get_flow_actions(struct net *net, + const struct nlattr *a, const struct sw_flow_key *key, const struct sw_flow_mask *mask, bool log) @@ -1034,7 +1076,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, int error; ovs_flow_mask_key(&masked_key, key, true, mask); - error = ovs_nla_copy_actions(a, &masked_key, &acts, log); + error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, "Actions may not be safe on all matching packets"); @@ -1046,6 +1088,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; struct sw_flow_key key; @@ -1070,15 +1113,15 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto error; /* Validate actions. 
*/ if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, - log); + acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, + &mask, log); if (IS_ERR(acts)) { error = PTR_ERR(acts); goto error; @@ -1094,7 +1137,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) } ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; @@ -1129,7 +1172,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) info, OVS_FLOW_CMD_NEW, false, ufid_flags); - if (unlikely(IS_ERR(reply))) { + if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_unlock_ovs; } @@ -1143,7 +1186,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, reply, info); if (old_acts) - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); return 0; @@ -1151,7 +1194,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); error: return error; } @@ -1160,6 +1203,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow; @@ -1174,7 +1218,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { OVS_NLERR(log, @@ -1218,6 +1262,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct 
ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow = NULL; @@ -1232,8 +1277,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, - log); + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + NULL, log); if (unlikely(err)) return err; } @@ -1786,7 +1831,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, - vport->ops->get_name(vport))) + ovs_vport_name(vport))) goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); @@ -2189,6 +2234,7 @@ static int __net_init ovs_init_net(struct net *net) INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + ovs_ct_init(net); return 0; } @@ -2205,13 +2251,10 @@ static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, struct vport *vport; hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) continue; - netdev_vport = netdev_vport_priv(vport); - if (dev_net(netdev_vport->dev) == dnet) + if (dev_net(vport->dev) == dnet) list_add(&vport->detach_list, head); } } @@ -2226,6 +2269,7 @@ static void __net_exit ovs_exit_net(struct net *dnet) struct net *net; LIST_HEAD(head); + ovs_ct_exit(dnet); ovs_lock(); list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); diff --git a/kernel/net/openvswitch/datapath.h b/kernel/net/openvswitch/datapath.h index 4ec4a480b..67bdecd9f 100644 --- a/kernel/net/openvswitch/datapath.h +++ 
b/kernel/net/openvswitch/datapath.h @@ -25,10 +25,11 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/u64_stats_sync.h> +#include <net/ip_tunnels.h> +#include "conntrack.h" #include "flow.h" #include "flow_table.h" -#include "vport.h" #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 @@ -92,14 +93,14 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB - * @egress_tun_key: Tunnel information about this packet on egress path. - * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. + * @mru: The maximum received fragement size; 0 if the packet is not + * fragmented. */ struct ovs_skb_cb { - struct ovs_tunnel_info *egress_tun_info; struct vport *input_vport; + u16 mru; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -112,12 +113,16 @@ struct ovs_skb_cb { * then no packet is sent and the packet is accounted in the datapath's @n_lost * counter. * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. + * @mru: If not zero, Maximum received IP fragment size. */ struct dp_upcall_info { - const struct ovs_tunnel_info *egress_tun_info; + struct ip_tunnel_info *egress_tun_info; const struct nlattr *userdata; + const struct nlattr *actions; + int actions_len; u32 portid; u8 cmd; + u16 mru; }; /** @@ -128,7 +133,9 @@ struct dp_upcall_info { struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; - struct vport_net vport_net; + + /* Module reference for configuring conntrack. 
*/ + bool xt_label; }; extern int ovs_net_id; @@ -197,6 +204,10 @@ void ovs_dp_notify_wq(struct work_struct *work); int action_fifos_init(void); void action_fifos_exit(void); +/* 'KEY' must not have any bits set outside of the 'MASK' */ +#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) +#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) + #define OVS_NLERR(logging_allowed, fmt, ...) \ do { \ if (logging_allowed && net_ratelimit()) \ diff --git a/kernel/net/openvswitch/dp_notify.c b/kernel/net/openvswitch/dp_notify.c index 2c631fe76..653d073ba 100644 --- a/kernel/net/openvswitch/dp_notify.c +++ b/kernel/net/openvswitch/dp_notify.c @@ -58,13 +58,10 @@ void ovs_dp_notify_wq(struct work_struct *work) struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - - if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) + if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; - netdev_vport = netdev_vport_priv(vport); - if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)) + if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) dp_detach_port_notify(vport); } } diff --git a/kernel/net/openvswitch/flow.c b/kernel/net/openvswitch/flow.c index 2dacc7b5a..0ea128eee 100644 --- a/kernel/net/openvswitch/flow.c +++ b/kernel/net/openvswitch/flow.c @@ -46,9 +46,11 @@ #include <net/mpls.h> #include <net/ndisc.h> +#include "conntrack.h" #include "datapath.h" #include "flow.h" #include "flow_netlink.h" +#include "vport.h" u64 ovs_flow_used_time(unsigned long flow_jiffies) { @@ -271,8 +273,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ipv6.addr.dst = nh->daddr; payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); - if (unlikely(payload_ofs < 0)) - return -EINVAL; if (frag_off) { if (frag_off & htons(~0x7)) @@ -283,6 +283,13 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_NONE; } + 
/* Delayed handling of error in ipv6_skip_exthdr() as it + * always sets frag_off to a valid value which may be + * used to set key->ip.frag above. + */ + if (unlikely(payload_ofs < 0)) + return -EPROTO; + nh_len = payload_ofs - nh_ofs; skb_set_transport_header(skb, nh_ofs + nh_len); key->ip.proto = nexthdr; @@ -332,7 +339,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) proto = *(__be16 *) skb->data; __skb_pull(skb, sizeof(__be16)); - if (ntohs(proto) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(proto)) return proto; if (skb->len < sizeof(struct llc_snap_hdr)) @@ -349,7 +356,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) __skb_pull(skb, sizeof(struct llc_snap_hdr)); - if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(llc->ethertype)) return llc->ethertype; return htons(ETH_P_802_2); @@ -622,12 +629,16 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { - memset(&key->ip, 0, sizeof(key->ip)); - memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); - if (nh_len == -EINVAL) { + switch (nh_len) { + case -EINVAL: + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); + /* fall-through */ + case -EPROTO: skb->transport_header = skb->network_header; error = 0; - } else { + break; + default: error = nh_len; } return error; @@ -682,24 +693,27 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. 
*/ if (tun_info) { - memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); + key->tun_proto = ip_tunnel_info_af(tun_info); + memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); - if (tun_info->options) { + if (tun_info->options_len) { BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * 8)) - 1 > sizeof(key->tun_opts)); - memcpy(TUN_METADATA_OPTS(key, tun_info->options_len), - tun_info->options, tun_info->options_len); + + ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len), + tun_info); key->tun_opts_len = tun_info->options_len; } else { key->tun_opts_len = 0; } } else { + key->tun_proto = 0; key->tun_opts_len = 0; memset(&key->tun_key, 0, sizeof(key->tun_key)); } @@ -707,13 +721,14 @@ int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; key->phy.skb_mark = skb->mark; + ovs_ct_fill_key(skb, key); key->ovs_flow_hash = 0; key->recirc_id = 0; return key_extract(skb, key); } -int ovs_flow_key_extract_userspace(const struct nlattr *attr, +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log) { @@ -722,7 +737,7 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr, memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); /* Extract metadata from netlink attributes. */ - err = ovs_nla_get_flow_metadata(attr, key, log); + err = ovs_nla_get_flow_metadata(net, attr, key, log); if (err) return err; diff --git a/kernel/net/openvswitch/flow.h b/kernel/net/openvswitch/flow.h index a076e445c..1d055c559 100644 --- a/kernel/net/openvswitch/flow.h +++ b/kernel/net/openvswitch/flow.h @@ -32,31 +32,11 @@ #include <linux/time.h> #include <linux/flex_array.h> #include <net/inet_ecn.h> +#include <net/ip_tunnels.h> +#include <net/dst_metadata.h> struct sk_buff; -/* Used to memset ovs_key_ipv4_tunnel padding. 
*/ -#define OVS_TUNNEL_KEY_SIZE \ - (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \ - FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst)) - -struct ovs_key_ipv4_tunnel { - __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; - __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; - __be16 tp_src; - __be16 tp_dst; -} __packed __aligned(4); /* Minimize padding. */ - -struct ovs_tunnel_info { - struct ovs_key_ipv4_tunnel tunnel; - const void *options; - u8 options_len; -}; - /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. @@ -66,54 +46,9 @@ struct ovs_tunnel_info { #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) -static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - tun_info->tunnel.tun_id = tun_id; - tun_info->tunnel.ipv4_src = saddr; - tun_info->tunnel.ipv4_dst = daddr; - tun_info->tunnel.ipv4_tos = tos; - tun_info->tunnel.ipv4_ttl = ttl; - tun_info->tunnel.tun_flags = tun_flags; - - /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of - * the upper tunnel are used. - * E.g: GRE over IPSEC, the tp_src and tp_port are zero. - */ - tun_info->tunnel.tp_src = tp_src; - tun_info->tunnel.tp_dst = tp_dst; - - /* Clear struct padding. 
*/ - if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE) - memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, - 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); - - tun_info->options = opts; - tun_info->options_len = opts_len; -} - -static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - const struct iphdr *iph, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, - iph->tos, iph->ttl, - tp_src, tp_dst, - tun_id, tun_flags, - opts, opts_len); -} +struct ovs_tunnel_info { + struct metadata_dst *tun_dst; +}; #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ @@ -122,12 +57,13 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, struct sw_flow_key { u8 tun_opts[255]; u8 tun_opts_len; - struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ + struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ + u8 tun_proto; /* Protocol of encapsulating tunnel. */ u32 ovs_flow_hash; /* Datapath computed hash value. */ u32 recirc_id; /* Recirculation ID. */ struct { @@ -176,6 +112,14 @@ struct sw_flow_key { } nd; } ipv6; }; + struct { + /* Connection tracking fields. */ + u16 zone; + u32 mark; + u8 state; + struct ovs_key_ct_labels labels; + } ct; + } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. 
*/ struct sw_flow_key_range { @@ -209,6 +153,7 @@ struct sw_flow_id { struct sw_flow_actions { struct rcu_head rcu; + size_t orig_len; /* From flow_cmd_new netlink actions size */ u32 actions_len; struct nlattr actions[]; }; @@ -273,11 +218,11 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ -int ovs_flow_key_extract_userspace(const struct nlattr *attr, +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log); diff --git a/kernel/net/openvswitch/flow_netlink.c b/kernel/net/openvswitch/flow_netlink.c index c691b1a1e..d1bd4a45c 100644 --- a/kernel/net/openvswitch/flow_netlink.c +++ b/kernel/net/openvswitch/flow_netlink.c @@ -47,9 +47,9 @@ #include <net/ipv6.h> #include <net/ndisc.h> #include <net/mpls.h> +#include <net/vxlan.h> #include "flow_netlink.h" -#include "vport-vxlan.h" struct ovs_len_tbl { int len; @@ -57,6 +57,7 @@ struct ovs_len_tbl { }; #define OVS_ATTR_NESTED -1 +#define OVS_ATTR_VARIABLE -2 static void update_range(struct sw_flow_match *match, size_t offset, size_t size, bool is_mask) @@ -261,8 +262,8 @@ size_t ovs_tun_key_attr_size(void) * updating this function. 
*/ return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ @@ -281,7 +282,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -290,6 +291,10 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_STATE */ + + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -300,6 +305,10 @@ size_t ovs_key_attr_size(void) + nla_total_size(28); /* OVS_KEY_ATTR_ND */ } +static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) }, +}; + static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, @@ -311,8 +320,11 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_OAM] = { .len 
= 0 }, - [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED }, - [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED }, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED, + .next = ovs_vxlan_ext_key_lens }, + [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -339,8 +351,19 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, .next = ovs_tunnel_key_lens, }, [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, + [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, + [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, }; +static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) +{ + return expected_len == attr_len || + expected_len == OVS_ATTR_NESTED || + expected_len == OVS_ATTR_VARIABLE; +} + static bool is_all_zero(const u8 *fp, size_t size) { int i; @@ -380,7 +403,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } expected_len = ovs_key_lens[type].len; - if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(nla), expected_len)) { OVS_NLERR(log, "Key %d has unexpected len %d expected %d", type, nla_len(nla), expected_len); return -EINVAL; @@ -465,29 +488,50 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a, return 0; } -static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 }, -}; - -static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, +static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, 
bool log) { - struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; + struct nlattr *a; + int rem; unsigned long opt_key_offset; - struct ovs_vxlan_opts opts; - int err; + struct vxlan_metadata opts; BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); - err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy); - if (err < 0) - return err; - memset(&opts, 0, sizeof(opts)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); - if (tb[OVS_VXLAN_EXT_GBP]) - opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]); + if (type > OVS_VXLAN_EXT_MAX) { + OVS_NLERR(log, "VXLAN extension %d out of range max %d", + type, OVS_VXLAN_EXT_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_vxlan_ext_key_lens[type].len)) { + OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d", + type, nla_len(a), + ovs_vxlan_ext_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_VXLAN_EXT_GBP: + opts.gbp = nla_get_u32(a); + break; + default: + OVS_NLERR(log, "Unknown VXLAN extension attribute %d", + type); + return -EINVAL; + } + } + if (rem) { + OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.", + rem); + return -EINVAL; + } if (!is_mask) SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); @@ -500,15 +544,15 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, return 0; } -static int ipv4_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask, - bool log) +static int ip_tun_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) { - struct nlattr *a; - int rem; - bool ttl = false; + bool ttl = false, ipv4 = false, ipv6 = false; __be16 tun_flags = 0; int opts_type = 0; + struct nlattr *a; + int rem; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); @@ -520,8 +564,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type].len != nla_len(a) && - 
ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(a), + ovs_tunnel_key_lens[type].len)) { OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", type, nla_len(a), ovs_tunnel_key_lens[type].len); return -EINVAL; @@ -534,19 +578,31 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_KEY; break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, nla_get_in_addr(a), is_mask); + ipv4 = true; break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst, nla_get_in_addr(a), is_mask); + ipv4 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + nla_get_in6_addr(a), is_mask); + ipv6 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_DST: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + nla_get_in6_addr(a), is_mask); + ipv6 = true; break; case OVS_TUNNEL_KEY_ATTR_TOS: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + SW_FLOW_KEY_PUT(match, tun_key.tos, nla_get_u8(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TTL: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + SW_FLOW_KEY_PUT(match, tun_key.ttl, nla_get_u8(a), is_mask); ttl = true; break; @@ -594,28 +650,46 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, opts_type = type; break; default: - OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d", + OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); return -EINVAL; } } SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + if (is_mask) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true); + else + SW_FLOW_KEY_PUT(match, tun_proto, ipv6 ? 
AF_INET6 : AF_INET, + false); if (rem > 0) { - OVS_NLERR(log, "IPv4 tunnel attribute has %d unknown bytes.", + OVS_NLERR(log, "IP tunnel attribute has %d unknown bytes.", rem); return -EINVAL; } + if (ipv4 && ipv6) { + OVS_NLERR(log, "Mixed IPv4 and IPv6 tunnel attributes"); + return -EINVAL; + } + if (!is_mask) { - if (!match->key->tun_key.ipv4_dst) { + if (!ipv4 && !ipv6) { + OVS_NLERR(log, "IP tunnel dst address not specified"); + return -EINVAL; + } + if (ipv4 && !match->key->tun_key.u.ipv4.dst) { OVS_NLERR(log, "IPv4 tunnel dst address is zero"); return -EINVAL; } + if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) { + OVS_NLERR(log, "IPv6 tunnel dst address is zero"); + return -EINVAL; + } if (!ttl) { - OVS_NLERR(log, "IPv4 tunnel TTL not specified."); + OVS_NLERR(log, "IP tunnel TTL not specified."); return -EINVAL; } } @@ -626,7 +700,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, static int vxlan_opt_to_nlattr(struct sk_buff *skb, const void *tun_opts, int swkey_tun_opts_len) { - const struct ovs_vxlan_opts *opts = tun_opts; + const struct vxlan_metadata *opts = tun_opts; struct nlattr *nla; nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); @@ -640,25 +714,40 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, return 0; } -static int __ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, - const void *tun_opts, int swkey_tun_opts_len) +static int __ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto) { if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; - if (output->ipv4_src && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, - output->ipv4_src)) - return -EMSGSIZE; - if (output->ipv4_dst && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, - output->ipv4_dst)) - return -EMSGSIZE; - if (output->ipv4_tos && - 
nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + switch (tun_proto) { + case AF_INET: + if (output->u.ipv4.src && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, + output->u.ipv4.src)) + return -EMSGSIZE; + if (output->u.ipv4.dst && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, + output->u.ipv4.dst)) + return -EMSGSIZE; + break; + case AF_INET6: + if (!ipv6_addr_any(&output->u.ipv6.src) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_SRC, + &output->u.ipv6.src)) + return -EMSGSIZE; + if (!ipv6_addr_any(&output->u.ipv6.dst) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_DST, + &output->u.ipv6.dst)) + return -EMSGSIZE; + break; + } + if (output->tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos)) return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) @@ -675,7 +764,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, if ((output->tun_flags & TUNNEL_OAM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; - if (tun_opts) { + if (swkey_tun_opts_len) { if (output->tun_flags & TUNNEL_GENEVE_OPT && nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, swkey_tun_opts_len, tun_opts)) @@ -688,9 +777,10 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } -static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, - const void *tun_opts, int swkey_tun_opts_len) +static int ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto) { struct nlattr *nla; int err; @@ -699,7 +789,8 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, if (!nla) return -EMSGSIZE; - err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len); + err = __ip_tun_to_nlattr(skb, output, 
tun_opts, swkey_tun_opts_len, + tun_proto); if (err) return err; @@ -707,17 +798,18 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } -int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, - const struct ovs_tunnel_info *egress_tun_info) +int ovs_nla_put_tunnel_info(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) { - return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel, - egress_tun_info->options, - egress_tun_info->options_len); + return __ip_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len, + ip_tunnel_info_af(tun_info)); } -static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, - const struct nlattr **a, bool is_mask, - bool log) +static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 *attrs, const struct nlattr **a, + bool is_mask, bool log) { if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); @@ -763,21 +855,58 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); } if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { - if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, - is_mask, log) < 0) + if (ip_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask, log) < 0) return -EINVAL; *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } + + if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) { + u32 ct_state = nla_get_u32(a[OVS_KEY_ATTR_CT_STATE]); + + if (ct_state & ~CT_SUPPORTED_MASK) { + OVS_NLERR(log, "ct_state flags %08x unsupported", + ct_state); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { + u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); + + SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); + *attrs &= ~(1ULL << 
OVS_KEY_ATTR_CT_ZONE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_MARK)) { + u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]); + + SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_LABELS) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABELS)) { + const struct ovs_key_ct_labels *cl; + + cl = nla_data(a[OVS_KEY_ATTR_CT_LABELS]); + SW_FLOW_KEY_MEMCPY(match, ct.labels, cl->ct_labels, + sizeof(*cl), is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); + } return 0; } -static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, - const struct nlattr **a, bool is_mask, - bool log) +static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 attrs, const struct nlattr **a, + bool is_mask, bool log) { int err; - err = metadata_from_nlattrs(match, &attrs, a, is_mask, log); + err = metadata_from_nlattrs(net, match, &attrs, a, is_mask, log); if (err) return err; @@ -816,7 +945,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, if (is_mask) { /* Always exact match EtherType. 
*/ eth_type = htons(0xffff); - } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { + } else if (!eth_proto_is_802_3(eth_type)) { OVS_NLERR(log, "EtherType %x is less than min %x", ntohs(eth_type), ETH_P_802_3_MIN); return -EINVAL; @@ -1012,10 +1141,16 @@ static void nlattr_set(struct nlattr *attr, u8 val, /* The nlattr stream should already have been validated */ nla_for_each_nested(nla, attr, rem) { - if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED) - nlattr_set(nla, val, tbl[nla_type(nla)].next); - else + if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) { + if (tbl[nla_type(nla)].next) + tbl = tbl[nla_type(nla)].next; + nlattr_set(nla, val, tbl); + } else { memset(nla_data(nla), val, nla_len(nla)); + } + + if (nla_type(nla) == OVS_KEY_ATTR_CT_STATE) + *(u32 *)nla_data(nla) &= CT_SUPPORTED_MASK; } } @@ -1029,6 +1164,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * mask. In case the 'mask' is NULL, the flow is treated as exact match * flow. Otherwise, it is treated as a wildcarded flow, except the mask * does not include any don't care bit. + * @net: Used to determine per-namespace field support. * @match: receives the extracted flow match information. * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. The fields should of the packet that triggered the creation @@ -1039,7 +1175,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. 
*/ -int ovs_nla_get_match(struct sw_flow_match *match, +int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, const struct nlattr *nla_key, const struct nlattr *nla_mask, bool log) @@ -1089,7 +1225,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, key_attrs, a, false, log); + err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); if (err) return err; @@ -1116,7 +1252,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, /* The userspace does not send tunnel attributes that * are 0, but we should not wildcard them nonetheless. */ - if (match->key->tun_key.ipv4_dst) + if (match->key->tun_proto) SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); @@ -1169,7 +1305,8 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, mask_attrs, a, true, log); + err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, + log); if (err) goto free_newmask; } @@ -1250,7 +1387,7 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) * extracted from the packet itself. 
*/ -int ovs_nla_get_flow_metadata(const struct nlattr *attr, +int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, struct sw_flow_key *key, bool log) { @@ -1266,9 +1403,10 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr, memset(&match, 0, sizeof(match)); match.key = key; + memset(&key->ct, 0, sizeof(key->ct)); key->phy.in_port = DP_MAX_PORTS; - return metadata_from_nlattrs(&match, &attrs, a, false, log); + return metadata_from_nlattrs(net, &match, &attrs, a, false, log); } static int __ovs_nla_put_key(const struct sw_flow_key *swkey, @@ -1287,14 +1425,14 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.ipv4_dst || is_mask)) { + if ((swkey->tun_proto || is_mask)) { const void *opts = NULL; if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); - if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, - swkey->tun_opts_len)) + if (ip_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len, swkey->tun_proto)) goto nla_put_failure; } @@ -1314,6 +1452,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; + if (ovs_ct_put_key(output, skb)) + goto nla_put_failure; + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); if (!nla) goto nla_put_failure; @@ -1548,11 +1689,51 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) return sfa; } +static void ovs_nla_free_set_action(const struct nlattr *a) +{ + const struct nlattr *ovs_key = nla_data(a); + struct ovs_tunnel_info *ovs_tun; + + switch (nla_type(ovs_key)) { + case OVS_KEY_ATTR_TUNNEL_INFO: + ovs_tun = nla_data(ovs_key); + dst_release((struct dst_entry *)ovs_tun->tun_dst); + break; + } +} + +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + const 
struct nlattr *a; + int rem; + + if (!sf_acts) + return; + + nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { + switch (nla_type(a)) { + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; + case OVS_ACTION_ATTR_CT: + ovs_ct_free_action(a); + break; + } + } + + kfree(sf_acts); +} + +static void __ovs_nla_free_flow_actions(struct rcu_head *head) +{ + ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu)); +} + /* Schedules 'sf_acts' to be freed after the next RCU grace period. * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts) { - kfree_rcu(sf_acts, rcu); + call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions); } static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, @@ -1582,6 +1763,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); acts->actions_len = (*sfa)->actions_len; + acts->orig_len = (*sfa)->orig_len; kfree(*sfa); *sfa = acts; @@ -1609,8 +1791,8 @@ static struct nlattr *__add_action(struct sw_flow_actions **sfa, return a; } -static int add_action(struct sw_flow_actions **sfa, int attrtype, - void *data, int len, bool log) +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, void *data, + int len, bool log) { struct nlattr *a; @@ -1625,7 +1807,7 @@ static inline int add_nested_action_start(struct sw_flow_actions **sfa, int used = (*sfa)->actions_len; int err; - err = add_action(sfa, attrtype, NULL, 0, log); + err = ovs_nla_add_action(sfa, attrtype, NULL, 0, log); if (err) return err; @@ -1641,12 +1823,12 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, a->nla_len = sfa->actions_len - st_offset; } -static int __ovs_nla_copy_actions(const struct nlattr *attr, +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr 
*attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log); -static int validate_and_copy_sample(const struct nlattr *attr, +static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) @@ -1678,15 +1860,15 @@ static int validate_and_copy_sample(const struct nlattr *attr, start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); if (start < 0) return start; - err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, - nla_data(probability), sizeof(u32), log); + err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, + nla_data(probability), sizeof(u32), log); if (err) return err; st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); if (st_acts < 0) return st_acts; - err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa, + err = __ovs_nla_copy_actions(net, actions, key, depth + 1, sfa, eth_type, vlan_tci, log); if (err) return err; @@ -1746,12 +1928,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; - struct ovs_tunnel_info *tun_info; + struct metadata_dst *tun_dst; + struct ip_tunnel_info *tun_info; + struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; ovs_match_init(&match, &key, NULL); - opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); + opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) return opts_type; @@ -1771,27 +1955,33 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; + tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL); + if (!tun_dst) + return -ENOMEM; + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info) + key.tun_opts_len, log); - if (IS_ERR(a)) + sizeof(*ovs_tun), log); + if (IS_ERR(a)) { + 
dst_release((struct dst_entry *)tun_dst); return PTR_ERR(a); + } - tun_info = nla_data(a); - tun_info->tunnel = key.tun_key; - tun_info->options_len = key.tun_opts_len; + ovs_tun = nla_data(a); + ovs_tun->tun_dst = tun_dst; - if (tun_info->options_len) { - /* We need to store the options in the action itself since - * everything else will go away after flow setup. We can append - * it to tun_info and then point there. - */ - memcpy((tun_info + 1), - TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len); - tun_info->options = (tun_info + 1); - } else { - tun_info->options = NULL; - } + tun_info = &tun_dst->u.tun_info; + tun_info->mode = IP_TUNNEL_INFO_TX; + if (key.tun_proto == AF_INET6) + tun_info->mode |= IP_TUNNEL_INFO_IPV6; + tun_info->key = key.tun_key; + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + ip_tunnel_info_opts_set(tun_info, + TUN_METADATA_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); add_nested_action_end(*sfa, start); return err; @@ -1829,8 +2019,7 @@ static int validate_set(const struct nlattr *a, key_len /= 2; if (key_type > OVS_KEY_ATTR_MAX || - (ovs_key_lens[key_type].len != key_len && - ovs_key_lens[key_type].len != OVS_ATTR_NESTED)) + !check_attr_len(key_len, ovs_key_lens[key_type].len)) return -EINVAL; if (masked && !validate_masked(nla_data(ovs_key), key_len)) @@ -1843,6 +2032,8 @@ static int validate_set(const struct nlattr *a, case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABELS: case OVS_KEY_ATTR_ETHERNET: break; @@ -2008,7 +2199,7 @@ static int copy_action(const struct nlattr *from, return 0; } -static int __ovs_nla_copy_actions(const struct nlattr *attr, +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool 
log) @@ -2032,7 +2223,8 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_SET] = (u32)-1, [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, - [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), + [OVS_ACTION_ATTR_CT] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2139,13 +2331,20 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa, + err = validate_and_copy_sample(net, a, key, depth, sfa, eth_type, vlan_tci, log); if (err) return err; skip_copy = true; break; + case OVS_ACTION_ATTR_CT: + err = ovs_ct_copy_action(net, a, key, sfa, log); + if (err) + return err; + skip_copy = true; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -2164,7 +2363,7 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, } /* 'key' must be the masked key. 
*/ -int ovs_nla_copy_actions(const struct nlattr *attr, +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log) { @@ -2174,10 +2373,11 @@ int ovs_nla_copy_actions(const struct nlattr *attr, if (IS_ERR(*sfa)) return PTR_ERR(*sfa); - err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, + (*sfa)->orig_len = nla_len(attr); + err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, key->eth.tci, log); if (err) - kfree(*sfa); + ovs_nla_free_flow_actions(*sfa); return err; } @@ -2227,16 +2427,17 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) switch (key_type) { case OVS_KEY_ATTR_TUNNEL_INFO: { - struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info; start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, - tun_info->options_len ? 
- tun_info->options : NULL, - tun_info->options_len); + err = ip_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len, + ip_tunnel_info_af(tun_info)); if (err) return err; nla_nest_end(skb, start); @@ -2298,6 +2499,13 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) if (err) return err; break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_action_to_attr(nla_data(a), skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/kernel/net/openvswitch/flow_netlink.h b/kernel/net/openvswitch/flow_netlink.h index 5c3d75bff..47dd142ec 100644 --- a/kernel/net/openvswitch/flow_netlink.h +++ b/kernel/net/openvswitch/flow_netlink.h @@ -45,29 +45,34 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); -int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *, - bool log); +int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, + struct sw_flow_key *, bool log); int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); -int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, - const struct nlattr *mask, bool log); -int ovs_nla_put_egress_tunnel_key(struct sk_buff *, - const struct ovs_tunnel_info *); +int ovs_nla_get_match(struct net *, struct sw_flow_match *, + const struct nlattr *key, const struct nlattr *mask, + bool log); + +int ovs_nla_put_tunnel_info(struct sk_buff *skb, + struct ip_tunnel_info *tun_info); bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, const struct sw_flow_key *key, bool log); u32 
ovs_nla_get_ufid_flags(const struct nlattr *attr); -int ovs_nla_copy_actions(const struct nlattr *attr, +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log); +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len, bool log); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); void ovs_nla_free_flow_actions(struct sw_flow_actions *); +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); #endif /* flow_netlink.h */ diff --git a/kernel/net/openvswitch/flow_table.c b/kernel/net/openvswitch/flow_table.c index aa349514e..d073fff82 100644 --- a/kernel/net/openvswitch/flow_table.c +++ b/kernel/net/openvswitch/flow_table.c @@ -18,6 +18,7 @@ #include "flow.h" #include "datapath.h" +#include "flow_netlink.h" #include <linux/uaccess.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> @@ -92,7 +93,8 @@ struct sw_flow *ovs_flow_alloc(void) /* Initialize the default stat node. */ stats = kmem_cache_alloc_node(flow_stats_cache, - GFP_KERNEL | __GFP_ZERO, 0); + GFP_KERNEL | __GFP_ZERO, + node_online(0) ? 
0 : NUMA_NO_NODE); if (!stats) goto err; @@ -144,7 +146,8 @@ static void flow_free(struct sw_flow *flow) if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); - kfree((struct sw_flow_actions __force *)flow->sf_acts); + if (flow->sf_acts) + ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); for_each_node(node) if (flow->stats[node]) kmem_cache_free(flow_stats_cache, @@ -425,7 +428,7 @@ static u32 flow_hash(const struct sw_flow_key *key, static int flow_key_start(const struct sw_flow_key *key) { - if (key->tun_key.ipv4_dst) + if (key->tun_proto) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), @@ -753,7 +756,7 @@ int ovs_flow_init(void) BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) - + (num_possible_nodes() + + (nr_node_ids * sizeof(struct flow_stats *)), 0, 0, NULL); if (flow_cache == NULL) diff --git a/kernel/net/openvswitch/vport-geneve.c b/kernel/net/openvswitch/vport-geneve.c index bf02fd580..e41cd12d9 100644 --- a/kernel/net/openvswitch/vport-geneve.c +++ b/kernel/net/openvswitch/vport-geneve.c @@ -26,113 +26,42 @@ #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" static struct vport_ops ovs_geneve_vport_ops; - /** * struct geneve_port - Keeps track of open UDP ports - * @gs: The socket created for this port number. - * @name: vport name. + * @dst_port: destination port. */ struct geneve_port { - struct geneve_sock *gs; - char name[IFNAMSIZ]; + u16 port_no; }; -static LIST_HEAD(geneve_ports); - static inline struct geneve_port *geneve_vport(const struct vport *vport) { return vport_priv(vport); } -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - -/* Convert 64 bit tunnel ID to 24 bit VNI. 
*/ -static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) -{ -#ifdef __BIG_ENDIAN - vni[0] = (__force __u8)(tun_id >> 16); - vni[1] = (__force __u8)(tun_id >> 8); - vni[2] = (__force __u8)tun_id; -#else - vni[0] = (__force __u8)((__force u64)tun_id >> 40); - vni[1] = (__force __u8)((__force u64)tun_id >> 48); - vni[2] = (__force __u8)((__force u64)tun_id >> 56); -#endif -} - -/* Convert 24 bit VNI to 64 bit tunnel ID. */ -static __be64 vni_to_tunnel_id(const __u8 *vni) -{ -#ifdef __BIG_ENDIAN - return (vni[0] << 16) | (vni[1] << 8) | vni[2]; -#else - return (__force __be64)(((__force u64)vni[0] << 40) | - ((__force u64)vni[1] << 48) | - ((__force u64)vni[2] << 56)); -#endif -} - -static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) -{ - struct vport *vport = gs->rcv_data; - struct genevehdr *geneveh = geneve_hdr(skb); - int opts_len; - struct ovs_tunnel_info tun_info; - __be64 key; - __be16 flags; - - opts_len = geneveh->opt_len * 4; - - flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | - (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | - (geneveh->oam ? TUNNEL_OAM : 0) | - (geneveh->critical ? 
TUNNEL_CRIT_OPT : 0); - - key = vni_to_tunnel_id(geneveh->vni); - - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, - geneveh->options, opts_len); - - ovs_vport_receive(vport, skb, &tun_info); -} - static int geneve_get_options(const struct vport *vport, struct sk_buff *skb) { struct geneve_port *geneve_port = geneve_vport(vport); - struct inet_sock *sk = inet_sk(geneve_port->gs->sock->sk); - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport))) + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no)) return -EMSGSIZE; return 0; } -static void geneve_tnl_destroy(struct vport *vport) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - - geneve_sock_release(geneve_port->gs); - - ovs_vport_deferred_free(vport); -} - static struct vport *geneve_tnl_create(const struct vport_parms *parms) { struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = parms->options; struct geneve_port *geneve_port; - struct geneve_sock *gs; + struct net_device *dev; struct vport *vport; struct nlattr *a; - int err; u16 dst_port; + int err; if (!options) { err = -EINVAL; @@ -154,106 +83,40 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return vport; geneve_port = geneve_vport(vport); - strncpy(geneve_port->name, parms->name, IFNAMSIZ); + geneve_port->port_no = dst_port; - gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); - if (IS_ERR(gs)) { + rtnl_lock(); + dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); + if (IS_ERR(dev)) { + rtnl_unlock(); ovs_vport_free(vport); - return (void *)gs; + return ERR_CAST(dev); } - geneve_port->gs = gs; + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); return vport; error: return ERR_PTR(err); } -static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) +static struct vport *geneve_create(const struct vport_parms *parms) { - const struct 
ovs_key_ipv4_tunnel *tun_key; - struct ovs_tunnel_info *tun_info; - struct net *net = ovs_dp_get_net(vport->dp); - struct geneve_port *geneve_port = geneve_vport(vport); - __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; - __be16 sport; - struct rtable *rt; - struct flowi4 fl; - u8 vni[3], opts_len, *opts; - __be16 df; - int err; - - tun_info = OVS_CB(skb)->egress_tun_info; - if (unlikely(!tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &tun_info->tunnel; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - tunnel_id_to_vni(tun_key->tun_id, vni); - skb->ignore_df = 1; - - if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) { - opts = (u8 *)tun_info->options; - opts_len = tun_info->options_len; - } else { - opts = NULL; - opts_len = 0; - } - - err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, - tun_key->ipv4_dst, tun_key->ipv4_tos, - tun_key->ipv4_ttl, df, sport, dport, - tun_key->tun_flags, vni, opts_len, opts, - !!(tun_key->tun_flags & TUNNEL_CSUM), false); - if (err < 0) - ip_rt_put(rt); - return err; - -error: - kfree_skb(skb); - return err; -} - -static const char *geneve_get_name(const struct vport *vport) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - - return geneve_port->name; -} + struct vport *vport; -static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - struct net *net = ovs_dp_get_net(vport->dp); - __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; - __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + vport = geneve_tnl_create(parms); + if (IS_ERR(vport)) + return vport; - /* Get tp_src and tp_dst, refert to geneve_build_header(). 
- */ - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, sport, dport); + return ovs_netdev_link(vport, parms->name); } static struct vport_ops ovs_geneve_vport_ops = { .type = OVS_VPORT_TYPE_GENEVE, - .create = geneve_tnl_create, - .destroy = geneve_tnl_destroy, - .get_name = geneve_get_name, + .create = geneve_create, + .destroy = ovs_netdev_tunnel_destroy, .get_options = geneve_get_options, - .send = geneve_tnl_send, - .owner = THIS_MODULE, - .get_egress_tun_info = geneve_get_egress_tun_info, + .send = dev_queue_xmit, }; static int __init ovs_geneve_tnl_init(void) diff --git a/kernel/net/openvswitch/vport-gre.c b/kernel/net/openvswitch/vport-gre.c index f17ac9642..7f8897f33 100644 --- a/kernel/net/openvswitch/vport-gre.c +++ b/kernel/net/openvswitch/vport-gre.c @@ -45,254 +45,50 @@ #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" static struct vport_ops ovs_gre_vport_ops; -/* Returns the least-significant 32 bits of a __be64. 
*/ -static __be32 be64_get_low32(__be64 x) +static struct vport *gre_tnl_create(const struct vport_parms *parms) { -#ifdef __BIG_ENDIAN - return (__force __be32)x; -#else - return (__force __be32)((__force u64)x >> 32); -#endif -} - -static __be16 filter_tnl_flags(__be16 flags) -{ - return flags & (TUNNEL_CSUM | TUNNEL_KEY); -} - -static struct sk_buff *__build_header(struct sk_buff *skb, - int tunnel_hlen) -{ - struct tnl_ptk_info tpi; - const struct ovs_key_ipv4_tunnel *tun_key; - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - - skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); - if (IS_ERR(skb)) - return skb; - - tpi.flags = filter_tnl_flags(tun_key->tun_flags); - tpi.proto = htons(ETH_P_TEB); - tpi.key = be64_get_low32(tun_key->tun_id); - tpi.seq = 0; - gre_build_header(skb, &tpi, tunnel_hlen); - - return skb; -} - -static __be64 key_to_tunnel_id(__be32 key, __be32 seq) -{ -#ifdef __BIG_ENDIAN - return (__force __be64)((__force u64)seq << 32 | (__force u32)key); -#else - return (__force __be64)((__force u64)key << 32 | (__force u32)seq); -#endif -} - -/* Called with rcu_read_lock and BH disabled. */ -static int gre_rcv(struct sk_buff *skb, - const struct tnl_ptk_info *tpi) -{ - struct ovs_tunnel_info tun_info; - struct ovs_net *ovs_net; - struct vport *vport; - __be64 key; - - ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); - vport = rcu_dereference(ovs_net->vport_net.gre_vport); - if (unlikely(!vport)) - return PACKET_REJECT; - - key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key, - filter_tnl_flags(tpi->flags), NULL, 0); - - ovs_vport_receive(vport, skb, &tun_info); - return PACKET_RCVD; -} - -/* Called with rcu_read_lock and BH disabled. 
*/ -static int gre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) -{ - struct ovs_net *ovs_net; + struct net *net = ovs_dp_get_net(parms->dp); + struct net_device *dev; struct vport *vport; - ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); - vport = rcu_dereference(ovs_net->vport_net.gre_vport); - - if (unlikely(!vport)) - return PACKET_REJECT; - else - return PACKET_RCVD; -} - -static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - struct net *net = ovs_dp_get_net(vport->dp); - const struct ovs_key_ipv4_tunnel *tun_key; - struct flowi4 fl; - struct rtable *rt; - int min_headroom; - int tunnel_hlen; - __be16 df; - int err; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto err_free_skb; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto err_free_skb; - } - - tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags); - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + tunnel_hlen + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) { - err = -ENOMEM; - goto err_free_rt; - } - - /* Push Tunnel header. 
*/ - skb = __build_header(skb, tunnel_hlen); - if (IS_ERR(skb)) { - err = PTR_ERR(skb); - skb = NULL; - goto err_free_rt; + vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + rtnl_lock(); + dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); } - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? - htons(IP_DF) : 0; - - skb->ignore_df = 1; - - return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, - tun_key->ipv4_dst, IPPROTO_GRE, - tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false); -err_free_rt: - ip_rt_put(rt); -err_free_skb: - kfree_skb(skb); - return err; -} - -static struct gre_cisco_protocol gre_protocol = { - .handler = gre_rcv, - .err_handler = gre_err, - .priority = 1, -}; - -static int gre_ports; -static int gre_init(void) -{ - int err; - - gre_ports++; - if (gre_ports > 1) - return 0; - - err = gre_cisco_register(&gre_protocol); - if (err) - pr_warn("cannot register gre protocol handler\n"); - - return err; -} - -static void gre_exit(void) -{ - gre_ports--; - if (gre_ports > 0) - return; - - gre_cisco_unregister(&gre_protocol); -} + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); -static const char *gre_get_name(const struct vport *vport) -{ - return vport_priv(vport); + return vport; } static struct vport *gre_create(const struct vport_parms *parms) { - struct net *net = ovs_dp_get_net(parms->dp); - struct ovs_net *ovs_net; struct vport *vport; - int err; - err = gre_init(); - if (err) - return ERR_PTR(err); - - ovs_net = net_generic(net, ovs_net_id); - if (ovsl_dereference(ovs_net->vport_net.gre_vport)) { - vport = ERR_PTR(-EEXIST); - goto error; - } - - vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms); + vport = gre_tnl_create(parms); if (IS_ERR(vport)) - goto error; - - strncpy(vport_priv(vport), parms->name, IFNAMSIZ); - rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport); - return vport; + 
return vport; -error: - gre_exit(); - return vport; -} - -static void gre_tnl_destroy(struct vport *vport) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct ovs_net *ovs_net; - - ovs_net = net_generic(net, ovs_net_id); - - RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL); - ovs_vport_deferred_free(vport); - gre_exit(); -} - -static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) -{ - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_GRE, skb->mark, 0, 0); + return ovs_netdev_link(vport, parms->name); } static struct vport_ops ovs_gre_vport_ops = { .type = OVS_VPORT_TYPE_GRE, .create = gre_create, - .destroy = gre_tnl_destroy, - .get_name = gre_get_name, - .send = gre_tnl_send, - .get_egress_tun_info = gre_get_egress_tun_info, - .owner = THIS_MODULE, + .send = dev_queue_xmit, + .destroy = ovs_netdev_tunnel_destroy, }; static int __init ovs_gre_tnl_init(void) diff --git a/kernel/net/openvswitch/vport-internal_dev.c b/kernel/net/openvswitch/vport-internal_dev.c index 6a55f7105..ec76398a7 100644 --- a/kernel/net/openvswitch/vport-internal_dev.c +++ b/kernel/net/openvswitch/vport-internal_dev.c @@ -43,35 +43,26 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev) return netdev_priv(netdev); } -/* This function is only called by the kernel network layer.*/ -static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev, - struct rtnl_link_stats64 *stats) -{ - struct vport *vport = ovs_internal_dev_get_vport(netdev); - struct ovs_vport_stats vport_stats; - - ovs_vport_get_stats(vport, &vport_stats); - - /* The tx and rx stats need to be swapped because the - * switch and host OS have opposite perspectives. 
*/ - stats->rx_packets = vport_stats.tx_packets; - stats->tx_packets = vport_stats.rx_packets; - stats->rx_bytes = vport_stats.tx_bytes; - stats->tx_bytes = vport_stats.rx_bytes; - stats->rx_errors = vport_stats.tx_errors; - stats->tx_errors = vport_stats.rx_errors; - stats->rx_dropped = vport_stats.tx_dropped; - stats->tx_dropped = vport_stats.rx_dropped; - - return stats; -} - /* Called with rcu_read_lock_bh. */ static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { + int len, err; + + len = skb->len; rcu_read_lock(); - ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); + err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); + + if (likely(!err)) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += len; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + netdev->stats.tx_errors++; + } return 0; } @@ -115,13 +106,45 @@ static void internal_dev_destructor(struct net_device *dev) free_netdev(dev); } +static struct rtnl_link_stats64 * +internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + stats->rx_errors = dev->stats.rx_errors; + stats->tx_errors = dev->stats.tx_errors; + stats->tx_dropped = dev->stats.tx_dropped; + stats->rx_dropped = dev->stats.rx_dropped; + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *percpu_stats; + struct pcpu_sw_netstats local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(dev->tstats, i); + + do { + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + + stats->rx_bytes += local_stats.rx_bytes; + stats->rx_packets += local_stats.rx_packets; + stats->tx_bytes += local_stats.tx_bytes; + stats->tx_packets += local_stats.tx_packets; + } + + return stats; +} + static 
const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, - .ndo_get_stats64 = internal_dev_get_stats, + .ndo_get_stats64 = internal_get_stats, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -135,7 +158,7 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -156,49 +179,51 @@ static void do_setup(struct net_device *netdev) static struct vport *internal_dev_create(const struct vport_parms *parms) { struct vport *vport; - struct netdev_vport *netdev_vport; struct internal_dev *internal_dev; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_internal_vport_ops, parms); + vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); goto error; } - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, NET_NAME_UNKNOWN, - do_setup); - if (!netdev_vport->dev) { + vport->dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, NET_NAME_UNKNOWN, do_setup); + if (!vport->dev) { err = -ENOMEM; goto error_free_vport; } + vport->dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!vport->dev->tstats) { + err = -ENOMEM; + goto error_free_netdev; + } - dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp)); - internal_dev = internal_dev_priv(netdev_vport->dev); + dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); + internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = 
vport; /* Restrict bridge port to current netns. */ if (vport->port_no == OVSP_LOCAL) - netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + vport->dev->features |= NETIF_F_NETNS_LOCAL; rtnl_lock(); - err = register_netdevice(netdev_vport->dev); + err = register_netdevice(vport->dev); if (err) - goto error_free_netdev; + goto error_unlock; - dev_set_promiscuity(netdev_vport->dev, 1); + dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); - netif_start_queue(netdev_vport->dev); + netif_start_queue(vport->dev); return vport; -error_free_netdev: +error_unlock: rtnl_unlock(); - free_netdev(netdev_vport->dev); + free_percpu(vport->dev->tstats); +error_free_netdev: + free_netdev(vport->dev); error_free_vport: ovs_vport_free(vport); error: @@ -207,49 +232,49 @@ error: static void internal_dev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - - netif_stop_queue(netdev_vport->dev); + netif_stop_queue(vport->dev); rtnl_lock(); - dev_set_promiscuity(netdev_vport->dev, -1); + dev_set_promiscuity(vport->dev, -1); /* unregister_netdevice() waits for an RCU grace period. 
*/ - unregister_netdevice(netdev_vport->dev); - + unregister_netdevice(vport->dev); + free_percpu(vport->dev->tstats); rtnl_unlock(); } -static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) +static netdev_tx_t internal_dev_recv(struct sk_buff *skb) { - struct net_device *netdev = netdev_vport_priv(vport)->dev; - int len; + struct net_device *netdev = skb->dev; + struct pcpu_sw_netstats *stats; if (unlikely(!(netdev->flags & IFF_UP))) { kfree_skb(skb); - return 0; + netdev->stats.rx_dropped++; + return NETDEV_TX_OK; } - len = skb->len; - skb_dst_drop(skb); nf_reset(skb); secpath_reset(skb); - skb->dev = netdev; skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, netdev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - netif_rx(skb); + stats = this_cpu_ptr(netdev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); - return len; + netif_rx(skb); + return NETDEV_TX_OK; } static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, - .get_name = ovs_netdev_get_name, .send = internal_dev_recv, }; diff --git a/kernel/net/openvswitch/vport-netdev.c b/kernel/net/openvswitch/vport-netdev.c index 33e6d6e29..6b0190b98 100644 --- a/kernel/net/openvswitch/vport-netdev.c +++ b/kernel/net/openvswitch/vport-netdev.c @@ -26,18 +26,24 @@ #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/openvswitch.h> +#include <linux/export.h> -#include <net/llc.h> +#include <net/ip_tunnels.h> +#include <net/rtnetlink.h> #include "datapath.h" +#include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static struct vport_ops ovs_netdev_vport_ops; /* Must be called with rcu_read_lock. 
*/ -static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) +static void netdev_port_receive(struct sk_buff *skb) { + struct vport *vport; + + vport = ovs_netdev_get_vport(skb->dev); if (unlikely(!vport)) goto error; @@ -53,10 +59,8 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - - ovs_vport_receive(vport, skb, NULL); + ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; - error: kfree_skb(skb); } @@ -65,15 +69,11 @@ error: static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; - struct vport *vport; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; - vport = ovs_netdev_get_vport(skb->dev); - - netdev_port_receive(vport, skb); - + netdev_port_receive(skb); return RX_HANDLER_CONSUMED; } @@ -83,139 +83,116 @@ static struct net_device *get_dpdev(const struct datapath *dp) local = ovs_vport_ovsl(dp, OVSP_LOCAL); BUG_ON(!local); - return netdev_vport_priv(local)->dev; + return local->dev; } -static struct vport *netdev_create(const struct vport_parms *parms) +struct vport *ovs_netdev_link(struct vport *vport, const char *name) { - struct vport *vport; - struct netdev_vport *netdev_vport; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_netdev_vport_ops, parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto error; - } - - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); - if (!netdev_vport->dev) { + vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); + if (!vport->dev) { err = -ENODEV; goto error_free_vport; } - if (netdev_vport->dev->flags & IFF_LOOPBACK || - netdev_vport->dev->type != ARPHRD_ETHER || - ovs_is_internal_dev(netdev_vport->dev)) { + if (vport->dev->flags & IFF_LOOPBACK || + vport->dev->type != ARPHRD_ETHER || + ovs_is_internal_dev(vport->dev)) { err 
= -EINVAL; goto error_put; } rtnl_lock(); - err = netdev_master_upper_dev_link(netdev_vport->dev, + err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp)); if (err) goto error_unlock; - err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, + err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; - dev_disable_lro(netdev_vport->dev); - dev_set_promiscuity(netdev_vport->dev, 1); - netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + dev_disable_lro(vport->dev); + dev_set_promiscuity(vport->dev, 1); + vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: - netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp)); + netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: - dev_put(netdev_vport->dev); + dev_put(vport->dev); error_free_vport: ovs_vport_free(vport); -error: return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(ovs_netdev_link); -static void free_port_rcu(struct rcu_head *rcu) +static struct vport *netdev_create(const struct vport_parms *parms) { - struct netdev_vport *netdev_vport = container_of(rcu, - struct netdev_vport, rcu); + struct vport *vport; + + vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; - dev_put(netdev_vport->dev); - ovs_vport_free(vport_from_priv(netdev_vport)); + return ovs_netdev_link(vport, parms->name); } -void ovs_netdev_detach_dev(struct vport *vport) +static void vport_netdev_free(struct rcu_head *rcu) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + struct vport *vport = container_of(rcu, struct vport, rcu); + if (vport->dev) + dev_put(vport->dev); + ovs_vport_free(vport); +} + +void ovs_netdev_detach_dev(struct vport *vport) +{ ASSERT_RTNL(); - netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; - netdev_rx_handler_unregister(netdev_vport->dev); - 
netdev_upper_dev_unlink(netdev_vport->dev, - netdev_master_upper_dev_get(netdev_vport->dev)); - dev_set_promiscuity(netdev_vport->dev, -1); + vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(vport->dev); + netdev_upper_dev_unlink(vport->dev, + netdev_master_upper_dev_get(vport->dev)); + dev_set_promiscuity(vport->dev, -1); } +EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev); static void netdev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - rtnl_lock(); - if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH) + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) ovs_netdev_detach_dev(vport); rtnl_unlock(); - call_rcu(&netdev_vport->rcu, free_port_rcu); -} - -const char *ovs_netdev_get_name(const struct vport *vport) -{ - const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - return netdev_vport->dev->name; -} - -static unsigned int packet_length(const struct sk_buff *skb) -{ - unsigned int length = skb->len - ETH_HLEN; - - if (skb->protocol == htons(ETH_P_8021Q)) - length -= VLAN_HLEN; - - return length; + call_rcu(&vport->rcu, vport_netdev_free); } -static int netdev_send(struct vport *vport, struct sk_buff *skb) +void ovs_netdev_tunnel_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - int mtu = netdev_vport->dev->mtu; - int len; - - if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { - net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", - netdev_vport->dev->name, - packet_length(skb), mtu); - goto drop; - } - - skb->dev = netdev_vport->dev; - len = skb->len; - dev_queue_xmit(skb); + rtnl_lock(); + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); - return len; + /* We can be invoked by both explicit vport deletion and + * underlying netdev deregistration; delete the link only + * if it's not already shutting down. 
+ */ + if (vport->dev->reg_state == NETREG_REGISTERED) + rtnl_delete_link(vport->dev); + dev_put(vport->dev); + vport->dev = NULL; + rtnl_unlock(); -drop: - kfree_skb(skb); - return 0; + call_rcu(&vport->rcu, vport_netdev_free); } +EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) @@ -231,8 +208,7 @@ static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, - .get_name = ovs_netdev_get_name, - .send = netdev_send, + .send = dev_queue_xmit, }; int __init ovs_netdev_init(void) diff --git a/kernel/net/openvswitch/vport-netdev.h b/kernel/net/openvswitch/vport-netdev.h index 6f7038e79..19e29c12a 100644 --- a/kernel/net/openvswitch/vport-netdev.h +++ b/kernel/net/openvswitch/vport-netdev.h @@ -26,22 +26,11 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -struct netdev_vport { - struct rcu_head rcu; - - struct net_device *dev; -}; - -static inline struct netdev_vport * -netdev_vport_priv(const struct vport *vport) -{ - return vport_priv(vport); -} - -const char *ovs_netdev_get_name(const struct vport *); +struct vport *ovs_netdev_link(struct vport *vport, const char *name); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); void ovs_netdev_exit(void); +void ovs_netdev_tunnel_destroy(struct vport *vport); #endif /* vport_netdev.h */ diff --git a/kernel/net/openvswitch/vport-vxlan.c b/kernel/net/openvswitch/vport-vxlan.c index 6d39766e7..d933cb89e 100644 --- a/kernel/net/openvswitch/vport-vxlan.c +++ b/kernel/net/openvswitch/vport-vxlan.c @@ -17,94 +17,37 @@ * 02110-1301, USA */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/net.h> -#include <linux/rculist.h> -#include <linux/udp.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/openvswitch.h> #include 
<linux/module.h> - -#include <net/icmp.h> -#include <net/ip.h> #include <net/udp.h> #include <net/ip_tunnels.h> #include <net/rtnetlink.h> -#include <net/route.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> #include <net/vxlan.h> #include "datapath.h" #include "vport.h" -#include "vport-vxlan.h" - -/** - * struct vxlan_port - Keeps track of open UDP ports - * @vs: vxlan_sock created for the port. - * @name: vport name. - */ -struct vxlan_port { - struct vxlan_sock *vs; - char name[IFNAMSIZ]; - u32 exts; /* VXLAN_F_* in <net/vxlan.h> */ -}; - -static struct vport_ops ovs_vxlan_vport_ops; - -static inline struct vxlan_port *vxlan_vport(const struct vport *vport) -{ - return vport_priv(vport); -} +#include "vport-netdev.h" -/* Called with rcu_read_lock and BH disabled. */ -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, - struct vxlan_metadata *md) -{ - struct ovs_tunnel_info tun_info; - struct vxlan_port *vxlan_port; - struct vport *vport = vs->data; - struct iphdr *iph; - struct ovs_vxlan_opts opts = { - .gbp = md->gbp, - }; - __be64 key; - __be16 flags; - - flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? 
TUNNEL_CSUM : 0); - vxlan_port = vxlan_vport(vport); - if (vxlan_port->exts & VXLAN_F_GBP && md->gbp) - flags |= TUNNEL_VXLAN_OPT; - - /* Save outer tunnel values */ - iph = ip_hdr(skb); - key = cpu_to_be64(ntohl(md->vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, &opts, sizeof(opts)); - - ovs_vport_receive(vport, skb, &tun_info); -} +static struct vport_ops ovs_vxlan_netdev_vport_ops; static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) { - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + __be16 dst_port = vxlan->cfg.dst_port; if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) return -EMSGSIZE; - if (vxlan_port->exts) { + if (vxlan->flags & VXLAN_F_GBP) { struct nlattr *exts; exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); if (!exts) return -EMSGSIZE; - if (vxlan_port->exts & VXLAN_F_GBP && + if (vxlan->flags & VXLAN_F_GBP && nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) return -EMSGSIZE; @@ -114,23 +57,14 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) return 0; } -static void vxlan_tnl_destroy(struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - - vxlan_sock_release(vxlan_port->vs); - - ovs_vport_deferred_free(vport); -} - -static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = { +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, }; -static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr) +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, + struct vxlan_config *conf) { - struct nlattr *exts[OVS_VXLAN_EXT_MAX+1]; - struct vxlan_port *vxlan_port; + struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; int err; if (nla_len(attr) < sizeof(struct nlattr)) @@ -140,10 +74,8 @@ static 
int vxlan_configure_exts(struct vport *vport, struct nlattr *attr) if (err < 0) return err; - vxlan_port = vxlan_vport(vport); - if (exts[OVS_VXLAN_EXT_GBP]) - vxlan_port->exts |= VXLAN_F_GBP; + conf->flags |= VXLAN_F_GBP; return 0; } @@ -152,166 +84,84 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) { struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = parms->options; - struct vxlan_port *vxlan_port; - struct vxlan_sock *vs; + struct net_device *dev; struct vport *vport; struct nlattr *a; - u16 dst_port; int err; + struct vxlan_config conf = { + .no_share = true, + .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX, + }; if (!options) { err = -EINVAL; goto error; } + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); if (a && nla_len(a) == sizeof(u16)) { - dst_port = nla_get_u16(a); + conf.dst_port = htons(nla_get_u16(a)); } else { /* Require destination port from userspace. */ err = -EINVAL; goto error; } - vport = ovs_vport_alloc(sizeof(struct vxlan_port), - &ovs_vxlan_vport_ops, parms); + vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; - vxlan_port = vxlan_vport(vport); - strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); if (a) { - err = vxlan_configure_exts(vport, a); + err = vxlan_configure_exts(vport, a, &conf); if (err) { ovs_vport_free(vport); goto error; } } - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, - vxlan_port->exts); - if (IS_ERR(vs)) { + rtnl_lock(); + dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); + if (IS_ERR(dev)) { + rtnl_unlock(); ovs_vport_free(vport); - return (void *)vs; + return ERR_CAST(dev); } - vxlan_port->vs = vs; + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); return vport; - error: return ERR_PTR(err); } -static int vxlan_ext_gbp(struct sk_buff *skb) +static struct vport *vxlan_create(const struct vport_parms 
*parms) { - const struct ovs_tunnel_info *tun_info; - const struct ovs_vxlan_opts *opts; - - tun_info = OVS_CB(skb)->egress_tun_info; - opts = tun_info->options; - - if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT && - tun_info->options_len >= sizeof(*opts)) - return opts->gbp; - else - return 0; -} - -static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - struct sock *sk = vxlan_port->vs->sock->sk; - __be16 dst_port = inet_sk(sk)->inet_sport; - const struct ovs_key_ipv4_tunnel *tun_key; - struct vxlan_metadata md = {0}; - struct rtable *rt; - struct flowi4 fl; - __be16 src_port; - __be16 df; - int err; - u32 vxflags; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? - htons(IP_DF) : 0; - - skb->ignore_df = 1; - - src_port = udp_flow_src_port(net, skb, 0, 0, true); - md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8); - md.gbp = vxlan_ext_gbp(skb); - vxflags = vxlan_port->exts | - (tun_key->tun_flags & TUNNEL_CSUM ? 
VXLAN_F_UDP_CSUM : 0); - - err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, tun_key->ipv4_ttl, df, - src_port, dst_port, - &md, false, vxflags); - if (err < 0) - ip_rt_put(rt); - return err; -error: - kfree_skb(skb); - return err; -} - -static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; - __be16 src_port; - int port_min; - int port_max; - - inet_get_local_port_range(net, &port_min, &port_max); - src_port = udp_flow_src_port(net, skb, 0, 0, true); + struct vport *vport; - return ovs_tunnel_get_egress_info(egress_tun_info, net, - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, - src_port, dst_port); -} + vport = vxlan_tnl_create(parms); + if (IS_ERR(vport)) + return vport; -static const char *vxlan_get_name(const struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - return vxlan_port->name; + return ovs_netdev_link(vport, parms->name); } -static struct vport_ops ovs_vxlan_vport_ops = { - .type = OVS_VPORT_TYPE_VXLAN, - .create = vxlan_tnl_create, - .destroy = vxlan_tnl_destroy, - .get_name = vxlan_get_name, - .get_options = vxlan_get_options, - .send = vxlan_tnl_send, - .get_egress_tun_info = vxlan_get_egress_tun_info, - .owner = THIS_MODULE, +static struct vport_ops ovs_vxlan_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_create, + .destroy = ovs_netdev_tunnel_destroy, + .get_options = vxlan_get_options, + .send = dev_queue_xmit, }; static int __init ovs_vxlan_tnl_init(void) { - return ovs_vport_ops_register(&ovs_vxlan_vport_ops); + return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); } static void __exit ovs_vxlan_tnl_exit(void) { - ovs_vport_ops_unregister(&ovs_vxlan_vport_ops); + 
ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); } module_init(ovs_vxlan_tnl_init); diff --git a/kernel/net/openvswitch/vport-vxlan.h b/kernel/net/openvswitch/vport-vxlan.h deleted file mode 100644 index 4b08233e7..000000000 --- a/kernel/net/openvswitch/vport-vxlan.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef VPORT_VXLAN_H -#define VPORT_VXLAN_H 1 - -#include <linux/kernel.h> -#include <linux/types.h> - -struct ovs_vxlan_opts { - __u32 gbp; -}; - -#endif diff --git a/kernel/net/openvswitch/vport.c b/kernel/net/openvswitch/vport.c index 067a3fff1..31cbc8c5c 100644 --- a/kernel/net/openvswitch/vport.c +++ b/kernel/net/openvswitch/vport.c @@ -34,9 +34,6 @@ #include "vport.h" #include "vport-internal_dev.h" -static void ovs_vport_record_error(struct vport *, - enum vport_err_type err_type); - static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. */ @@ -74,7 +71,7 @@ static struct hlist_head *hash_bucket(const struct net *net, const char *name) return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } -int ovs_vport_ops_register(struct vport_ops *ops) +int __ovs_vport_ops_register(struct vport_ops *ops) { int err = -EEXIST; struct vport_ops *o; @@ -90,7 +87,7 @@ errout: ovs_unlock(); return err; } -EXPORT_SYMBOL_GPL(ovs_vport_ops_register); +EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); void ovs_vport_ops_unregister(struct vport_ops *ops) { @@ -113,7 +110,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node) - if (!strcmp(name, vport->ops->get_name(vport)) && + if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; @@ -157,12 +154,6 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, return ERR_PTR(-EINVAL); } - vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!vport->percpu_stats) { - kfree(vport); - return ERR_PTR(-ENOMEM); - } - return 
vport; } EXPORT_SYMBOL_GPL(ovs_vport_alloc); @@ -183,7 +174,6 @@ void ovs_vport_free(struct vport *vport) * it is safe to use raw dereference. */ kfree(rcu_dereference_raw(vport->upcall_portids)); - free_percpu(vport->percpu_stats); kfree(vport); } EXPORT_SYMBOL_GPL(ovs_vport_free); @@ -226,7 +216,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms) } bucket = hash_bucket(ovs_dp_get_net(vport->dp), - vport->ops->get_name(vport)); + ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); return vport; } @@ -266,8 +256,8 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) * * @vport: vport to delete. * - * Detaches @vport from its datapath and destroys it. It is possible to fail - * for reasons such as lack of memory. ovs_mutex must be held. + * Detaches @vport from its datapath and destroys it. ovs_mutex must + * be held. */ void ovs_vport_del(struct vport *vport) { @@ -290,41 +280,19 @@ void ovs_vport_del(struct vport *vport) */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { - int i; - - memset(stats, 0, sizeof(*stats)); - - /* We potentially have 2 sources of stats that need to be combined: - * those we have collected (split into err_stats and percpu_stats) from - * set_stats() and device error stats from netdev->get_stats() (for - * errors that happen downstream and therefore aren't reported through - * our vport_record_error() function). - * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS). - * netdev-stats can be directly read over netlink-ioctl. 
- */ - - stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors); - stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors); - stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped); - stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped); - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *percpu_stats; - struct pcpu_sw_netstats local_stats; - unsigned int start; - - percpu_stats = per_cpu_ptr(vport->percpu_stats, i); - - do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); - local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); - - stats->rx_bytes += local_stats.rx_bytes; - stats->rx_packets += local_stats.rx_packets; - stats->tx_bytes += local_stats.tx_bytes; - stats->tx_packets += local_stats.tx_packets; - } + const struct rtnl_link_stats64 *dev_stats; + struct rtnl_link_stats64 temp; + + dev_stats = dev_get_stats(vport->dev, &temp); + stats->rx_errors = dev_stats->rx_errors; + stats->tx_errors = dev_stats->tx_errors; + stats->tx_dropped = dev_stats->tx_dropped; + stats->rx_dropped = dev_stats->rx_dropped; + + stats->rx_bytes = dev_stats->rx_bytes; + stats->rx_packets = dev_stats->rx_packets; + stats->tx_bytes = dev_stats->tx_bytes; + stats->tx_packets = dev_stats->tx_packets; } /** @@ -468,94 +436,34 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. */ -void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - const struct ovs_tunnel_info *tun_info) +int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, + const struct ip_tunnel_info *tun_info) { - struct pcpu_sw_netstats *stats; struct sw_flow_key key; int error; - stats = this_cpu_ptr(vport->percpu_stats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len + - (skb_vlan_tag_present(skb) ? 
VLAN_HLEN : 0); - u64_stats_update_end(&stats->syncp); - OVS_CB(skb)->input_vport = vport; - OVS_CB(skb)->egress_tun_info = NULL; + OVS_CB(skb)->mru = 0; + if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { + u32 mark; + + mark = skb->mark; + skb_scrub_packet(skb, true); + skb->mark = mark; + tun_info = NULL; + } + /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); - return; + return error; } ovs_dp_process_packet(skb, &key); + return 0; } EXPORT_SYMBOL_GPL(ovs_vport_receive); -/** - * ovs_vport_send - send a packet on a device - * - * @vport: vport on which to send the packet - * @skb: skb to send - * - * Sends the given packet and returns the length of data sent. Either ovs - * lock or rcu_read_lock must be held. - */ -int ovs_vport_send(struct vport *vport, struct sk_buff *skb) -{ - int sent = vport->ops->send(vport, skb); - - if (likely(sent > 0)) { - struct pcpu_sw_netstats *stats; - - stats = this_cpu_ptr(vport->percpu_stats); - - u64_stats_update_begin(&stats->syncp); - stats->tx_packets++; - stats->tx_bytes += sent; - u64_stats_update_end(&stats->syncp); - } else if (sent < 0) { - ovs_vport_record_error(vport, VPORT_E_TX_ERROR); - } else { - ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); - } - return sent; -} - -/** - * ovs_vport_record_error - indicate device error to generic stats layer - * - * @vport: vport that encountered the error - * @err_type: one of enum vport_err_type types to indicate the error type - * - * If using the vport generic stats layer indicate that an error of the given - * type has occurred. 
- */ -static void ovs_vport_record_error(struct vport *vport, - enum vport_err_type err_type) -{ - switch (err_type) { - case VPORT_E_RX_DROPPED: - atomic_long_inc(&vport->err_stats.rx_dropped); - break; - - case VPORT_E_RX_ERROR: - atomic_long_inc(&vport->err_stats.rx_errors); - break; - - case VPORT_E_TX_DROPPED: - atomic_long_inc(&vport->err_stats.tx_dropped); - break; - - case VPORT_E_TX_ERROR: - atomic_long_inc(&vport->err_stats.tx_errors); - break; - } - -} - static void free_vport_rcu(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); @@ -572,56 +480,32 @@ void ovs_vport_deferred_free(struct vport *vport) } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, - struct net *net, - const struct ovs_tunnel_info *tun_info, - u8 ipproto, - u32 skb_mark, - __be16 tp_src, - __be16 tp_dst) +static unsigned int packet_length(const struct sk_buff *skb) { - const struct ovs_key_ipv4_tunnel *tun_key; - struct rtable *rt; - struct flowi4 fl; - - if (unlikely(!tun_info)) - return -EINVAL; - - tun_key = &tun_info->tunnel; - - /* Route lookup to get srouce IP address. - * The process may need to be changed if the corresponding process - * in vports ops changed. 
- */ - rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto); - if (IS_ERR(rt)) - return PTR_ERR(rt); + unsigned int length = skb->len - ETH_HLEN; - ip_rt_put(rt); + if (skb->protocol == htons(ETH_P_8021Q)) + length -= VLAN_HLEN; - /* Generate egress_tun_info based on tun_info, - * saddr, tp_src and tp_dst - */ - __ovs_flow_tun_info_init(egress_tun_info, - fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, - tun_key->ipv4_ttl, - tp_src, tp_dst, - tun_key->tun_id, - tun_key->tun_flags, - tun_info->options, - tun_info->options_len); - - return 0; + return length; } -EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); -int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info) +void ovs_vport_send(struct vport *vport, struct sk_buff *skb) { - /* get_egress_tun_info() is only implemented on tunnel ports. */ - if (unlikely(!vport->ops->get_egress_tun_info)) - return -EINVAL; + int mtu = vport->dev->mtu; + + if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { + net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", + vport->dev->name, + packet_length(skb), mtu); + vport->dev->stats.tx_errors++; + goto drop; + } + + skb->dev = vport->dev; + vport->ops->send(skb); + return; - return vport->ops->get_egress_tun_info(vport, skb, info); +drop: + kfree_skb(skb); } diff --git a/kernel/net/openvswitch/vport.h b/kernel/net/openvswitch/vport.h index bc85331a6..8ea3a9698 100644 --- a/kernel/net/openvswitch/vport.h +++ b/kernel/net/openvswitch/vport.h @@ -35,10 +35,6 @@ struct vport_parms; /* The following definitions are for users of the vport subsytem: */ -struct vport_net { - struct vport __rcu *gre_vport; -}; - int ovs_vport_init(void); void ovs_vport_exit(void); @@ -56,26 +52,6 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); -int 
ovs_vport_send(struct vport *, struct sk_buff *); - -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, - struct net *net, - const struct ovs_tunnel_info *tun_info, - u8 ipproto, - u32 skb_mark, - __be16 tp_src, - __be16 tp_dst); -int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info); - -/* The following definitions are for implementers of vport devices: */ - -struct vport_err_stats { - atomic_long_t rx_dropped; - atomic_long_t rx_errors; - atomic_long_t tx_dropped; - atomic_long_t tx_errors; -}; /** * struct vport_portids - array of netlink portids of a vport. * must be protected by rcu. @@ -101,12 +77,10 @@ struct vport_portids { * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. - * @percpu_stats: Points to per-CPU statistics used and maintained by vport - * @err_stats: Points to error statistics used and maintained by vport * @detach_list: list used for detaching vport in net-exit call. */ struct vport { - struct rcu_head rcu; + struct net_device *dev; struct datapath *dp; struct vport_portids __rcu *upcall_portids; u16 port_no; @@ -115,10 +89,8 @@ struct vport { struct hlist_node dp_hash_node; const struct vport_ops *ops; - struct pcpu_sw_netstats __percpu *percpu_stats; - - struct vport_err_stats err_stats; struct list_head detach_list; + struct rcu_head rcu; }; /** @@ -155,11 +127,8 @@ struct vport_parms { * @get_options: Appends vport-specific attributes for the configuration of an * existing vport to a &struct sk_buff. May be %NULL for a vport that does not * have any configuration. - * @get_name: Get the device's name. - * @send: Send a packet on the device. Returns the length of the packet sent, + * @send: Send a packet on the device. * zero for dropped packets or negative for error. - * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for - * a packet. 
*/ struct vport_ops { enum ovs_vport_type type; @@ -171,24 +140,11 @@ struct vport_ops { int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); - /* Called with rcu_read_lock or ovs_mutex. */ - const char *(*get_name)(const struct vport *); - - int (*send)(struct vport *, struct sk_buff *); - int (*get_egress_tun_info)(struct vport *, struct sk_buff *, - struct ovs_tunnel_info *); - + netdev_tx_t (*send) (struct sk_buff *skb); struct module *owner; struct list_head list; }; -enum vport_err_type { - VPORT_E_RX_DROPPED, - VPORT_E_RX_ERROR, - VPORT_E_TX_DROPPED, - VPORT_E_TX_ERROR, -}; - struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); @@ -225,8 +181,8 @@ static inline struct vport *vport_from_priv(void *priv) return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } -void ovs_vport_receive(struct vport *, struct sk_buff *, - const struct ovs_tunnel_info *); +int ovs_vport_receive(struct vport *, struct sk_buff *, + const struct ip_tunnel_info *); static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) @@ -235,11 +191,22 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); } -int ovs_vport_ops_register(struct vport_ops *ops); +static inline const char *ovs_vport_name(struct vport *vport) +{ + return vport->dev->name; +} + +int __ovs_vport_ops_register(struct vport_ops *ops); +#define ovs_vport_ops_register(ops) \ + ({ \ + (ops)->owner = THIS_MODULE; \ + __ovs_vport_ops_register(ops); \ + }) + void ovs_vport_ops_unregister(struct vport_ops *ops); static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, - const struct ovs_key_ipv4_tunnel *key, + const struct ip_tunnel_key *key, u32 mark, struct flowi4 *fl, u8 protocol) @@ -247,13 +214,16 @@ static inline struct rtable 
*ovs_tunnel_route_lookup(struct net *net, struct rtable *rt; memset(fl, 0, sizeof(*fl)); - fl->daddr = key->ipv4_dst; - fl->saddr = key->ipv4_src; - fl->flowi4_tos = RT_TOS(key->ipv4_tos); + fl->daddr = key->u.ipv4.dst; + fl->saddr = key->u.ipv4.src; + fl->flowi4_tos = RT_TOS(key->tos); fl->flowi4_mark = mark; fl->flowi4_proto = protocol; rt = ip_route_output_key(net, fl); return rt; } + +void ovs_vport_send(struct vport *vport, struct sk_buff *skb); + #endif /* vport.h */ |