summaryrefslogtreecommitdiffstats
path: root/kernel/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/net/ipv4')
-rw-r--r--kernel/net/ipv4/Kconfig34
-rw-r--r--kernel/net/ipv4/Makefile3
-rw-r--r--kernel/net/ipv4/af_inet.c89
-rw-r--r--kernel/net/ipv4/ah4.c4
-rw-r--r--kernel/net/ipv4/arp.c147
-rw-r--r--kernel/net/ipv4/datagram.c2
-rw-r--r--kernel/net/ipv4/devinet.c25
-rw-r--r--kernel/net/ipv4/esp4.c201
-rw-r--r--kernel/net/ipv4/fib_frontend.c130
-rw-r--r--kernel/net/ipv4/fib_lookup.h1
-rw-r--r--kernel/net/ipv4/fib_rules.c6
-rw-r--r--kernel/net/ipv4/fib_semantics.c520
-rw-r--r--kernel/net/ipv4/fib_trie.c90
-rw-r--r--kernel/net/ipv4/fou.c35
-rw-r--r--kernel/net/ipv4/geneve.c453
-rw-r--r--kernel/net/ipv4/gre_demux.c235
-rw-r--r--kernel/net/ipv4/gre_offload.c3
-rw-r--r--kernel/net/ipv4/icmp.c32
-rw-r--r--kernel/net/ipv4/igmp.c214
-rw-r--r--kernel/net/ipv4/inet_connection_sock.c297
-rw-r--r--kernel/net/ipv4/inet_diag.c152
-rw-r--r--kernel/net/ipv4/inet_fragment.c46
-rw-r--r--kernel/net/ipv4/inet_hashtables.c137
-rw-r--r--kernel/net/ipv4/inet_timewait_sock.c73
-rw-r--r--kernel/net/ipv4/inetpeer.c20
-rw-r--r--kernel/net/ipv4/ip_forward.c37
-rw-r--r--kernel/net/ipv4/ip_fragment.c120
-rw-r--r--kernel/net/ipv4/ip_gre.c464
-rw-r--r--kernel/net/ipv4/ip_input.c50
-rw-r--r--kernel/net/ipv4/ip_output.c204
-rw-r--r--kernel/net/ipv4/ip_sockglue.c54
-rw-r--r--kernel/net/ipv4/ip_tunnel.c37
-rw-r--r--kernel/net/ipv4/ip_tunnel_core.c258
-rw-r--r--kernel/net/ipv4/ip_vti.c2
-rw-r--r--kernel/net/ipv4/ipconfig.c34
-rw-r--r--kernel/net/ipv4/ipip.c8
-rw-r--r--kernel/net/ipv4/ipmr.c39
-rw-r--r--kernel/net/ipv4/netfilter.c16
-rw-r--r--kernel/net/ipv4/netfilter/Kconfig17
-rw-r--r--kernel/net/ipv4/netfilter/Makefile3
-rw-r--r--kernel/net/ipv4/netfilter/arp_tables.c131
-rw-r--r--kernel/net/ipv4/netfilter/arptable_filter.c7
-rw-r--r--kernel/net/ipv4/netfilter/ip_tables.c170
-rw-r--r--kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c17
-rw-r--r--kernel/net/ipv4/netfilter/ipt_ECN.c2
-rw-r--r--kernel/net/ipv4/netfilter/ipt_REJECT.c2
-rw-r--r--kernel/net/ipv4/netfilter/ipt_SYNPROXY.c35
-rw-r--r--kernel/net/ipv4/netfilter/ipt_ah.c2
-rw-r--r--kernel/net/ipv4/netfilter/ipt_rpfilter.c11
-rw-r--r--kernel/net/ipv4/netfilter/iptable_filter.c9
-rw-r--r--kernel/net/ipv4/netfilter/iptable_mangle.c19
-rw-r--r--kernel/net/ipv4/netfilter/iptable_nat.c26
-rw-r--r--kernel/net/ipv4/netfilter/iptable_raw.c9
-rw-r--r--kernel/net/ipv4/netfilter/iptable_security.c12
-rw-r--r--kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c20
-rw-r--r--kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c8
-rw-r--r--kernel/net/ipv4/netfilter/nf_defrag_ipv4.c42
-rw-r--r--kernel/net/ipv4/netfilter/nf_dup_ipv4.c106
-rw-r--r--kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c48
-rw-r--r--kernel/net/ipv4/netfilter/nf_nat_pptp.c2
-rw-r--r--kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c2
-rw-r--r--kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c2
-rw-r--r--kernel/net/ipv4/netfilter/nf_reject_ipv4.c6
-rw-r--r--kernel/net/ipv4/netfilter/nf_tables_arp.c6
-rw-r--r--kernel/net/ipv4/netfilter/nf_tables_ipv4.c10
-rw-r--r--kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c22
-rw-r--r--kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c8
-rw-r--r--kernel/net/ipv4/netfilter/nft_dup_ipv4.c110
-rw-r--r--kernel/net/ipv4/netfilter/nft_masq_ipv4.c2
-rw-r--r--kernel/net/ipv4/netfilter/nft_redir_ipv4.c2
-rw-r--r--kernel/net/ipv4/netfilter/nft_reject_ipv4.c5
-rw-r--r--kernel/net/ipv4/ping.c7
-rw-r--r--kernel/net/ipv4/proc.c4
-rw-r--r--kernel/net/ipv4/raw.c28
-rw-r--r--kernel/net/ipv4/route.c354
-rw-r--r--kernel/net/ipv4/syncookies.c33
-rw-r--r--kernel/net/ipv4/sysctl_net_ipv4.c59
-rw-r--r--kernel/net/ipv4/tcp.c189
-rw-r--r--kernel/net/ipv4/tcp_bic.c2
-rw-r--r--kernel/net/ipv4/tcp_cdg.c433
-rw-r--r--kernel/net/ipv4/tcp_cong.c27
-rw-r--r--kernel/net/ipv4/tcp_cubic.c26
-rw-r--r--kernel/net/ipv4/tcp_dctcp.c26
-rw-r--r--kernel/net/ipv4/tcp_diag.c8
-rw-r--r--kernel/net/ipv4/tcp_fastopen.c75
-rw-r--r--kernel/net/ipv4/tcp_highspeed.c2
-rw-r--r--kernel/net/ipv4/tcp_htcp.c2
-rw-r--r--kernel/net/ipv4/tcp_hybla.c2
-rw-r--r--kernel/net/ipv4/tcp_illinois.c2
-rw-r--r--kernel/net/ipv4/tcp_input.c532
-rw-r--r--kernel/net/ipv4/tcp_ipv4.c273
-rw-r--r--kernel/net/ipv4/tcp_metrics.c83
-rw-r--r--kernel/net/ipv4/tcp_minisocks.c88
-rw-r--r--kernel/net/ipv4/tcp_offload.c4
-rw-r--r--kernel/net/ipv4/tcp_output.c278
-rw-r--r--kernel/net/ipv4/tcp_recovery.c109
-rw-r--r--kernel/net/ipv4/tcp_scalable.c2
-rw-r--r--kernel/net/ipv4/tcp_timer.c25
-rw-r--r--kernel/net/ipv4/tcp_vegas.c6
-rw-r--r--kernel/net/ipv4/tcp_veno.c2
-rw-r--r--kernel/net/ipv4/tcp_yeah.c2
-rw-r--r--kernel/net/ipv4/udp.c21
-rw-r--r--kernel/net/ipv4/udp_diag.c2
-rw-r--r--kernel/net/ipv4/udp_tunnel.c33
-rw-r--r--kernel/net/ipv4/xfrm4_input.c7
-rw-r--r--kernel/net/ipv4/xfrm4_output.c13
-rw-r--r--kernel/net/ipv4/xfrm4_policy.c113
107 files changed, 4964 insertions, 3053 deletions
diff --git a/kernel/net/ipv4/Kconfig b/kernel/net/ipv4/Kconfig
index bd2901604..416dfa004 100644
--- a/kernel/net/ipv4/Kconfig
+++ b/kernel/net/ipv4/Kconfig
@@ -331,20 +331,6 @@ config NET_FOU_IP_TUNNELS
When this option is enabled IP tunnels can be configured to use
FOU or GUE encapsulation.
-config GENEVE
- tristate "Generic Network Virtualization Encapsulation (Geneve)"
- depends on INET
- select NET_UDP_TUNNEL
- ---help---
- This allows one to create Geneve virtual interfaces that provide
- Layer 2 Networks over Layer 3 Networks. Geneve is often used
- to tunnel virtual network infrastructure in virtualized environments.
- For more information see:
- http://tools.ietf.org/html/draft-gross-geneve-01
-
- To compile this driver as a module, choose M here: the module
-
-
config INET_AH
tristate "IP: AH transformation"
select XFRM_ALGO
@@ -615,6 +601,22 @@ config TCP_CONG_DCTCP
For further details see:
http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+config TCP_CONG_CDG
+ tristate "CAIA Delay-Gradient (CDG)"
+ default n
+ ---help---
+ CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+ the TCP sender in order to:
+
+ o Use the delay gradient as a congestion signal.
+ o Back off with an average probability that is independent of the RTT.
+ o Coexist with flows that use loss-based congestion control.
+ o Tolerate packet loss unrelated to congestion.
+
+ For further details see:
+ D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -646,6 +648,9 @@ choice
config DEFAULT_DCTCP
bool "DCTCP" if TCP_CONG_DCTCP=y
+ config DEFAULT_CDG
+ bool "CDG" if TCP_CONG_CDG=y
+
config DEFAULT_RENO
bool "Reno"
endchoice
@@ -668,6 +673,7 @@ config DEFAULT_TCP_CONG
default "veno" if DEFAULT_VENO
default "reno" if DEFAULT_RENO
default "dctcp" if DEFAULT_DCTCP
+ default "cdg" if DEFAULT_CDG
default "cubic"
config TCP_MD5SIG
diff --git a/kernel/net/ipv4/Makefile b/kernel/net/ipv4/Makefile
index 518c04ed6..c29809f76 100644
--- a/kernel/net/ipv4/Makefile
+++ b/kernel/net/ipv4/Makefile
@@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -42,6 +43,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
@@ -56,7 +58,6 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
-obj-$(CONFIG_GENEVE) += geneve.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o xfrm4_protocol.o
diff --git a/kernel/net/ipv4/af_inet.c b/kernel/net/ipv4/af_inet.c
index a5aa54ea6..5c5db6636 100644
--- a/kernel/net/ipv4/af_inet.c
+++ b/kernel/net/ipv4/af_inet.c
@@ -112,12 +112,14 @@
#include <net/raw.h>
#include <net/icmp.h>
#include <net/inet_common.h>
+#include <net/ip_tunnels.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/secure_seq.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
+#include <net/l3mdev.h>
/* The inetsw table contains everything that inet_create needs to
@@ -217,17 +219,13 @@ int inet_listen(struct socket *sock, int backlog)
* shutdown() (rather than close()).
*/
if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
- !inet_csk(sk)->icsk_accept_queue.fastopenq) {
+ !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
- err = fastopen_init_queue(sk, backlog);
+ fastopen_queue_tune(sk, backlog);
else if ((sysctl_tcp_fastopen &
TFO_SERVER_WO_SOCKOPT2) != 0)
- err = fastopen_init_queue(sk,
+ fastopen_queue_tune(sk,
((uint)sysctl_tcp_fastopen) >> 16);
- else
- err = 0;
- if (err)
- goto out;
tcp_fastopen_init_key_once(true);
}
@@ -259,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
int try_loading_module = 0;
int err;
+ if (protocol < 0 || protocol >= IPPROTO_MAX)
+ return -EINVAL;
+
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
@@ -319,7 +320,7 @@ lookup_protocol:
WARN_ON(!answer_prot->slab);
err = -ENOBUFS;
- sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
+ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
if (!sk)
goto out;
@@ -426,6 +427,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
+ u32 tb_id = RT_TABLE_LOCAL;
int err;
/* If the socket has its own bind function then use it. (RAW) */
@@ -447,7 +449,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
}
- chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
+ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
@@ -490,7 +493,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
- if (sk->sk_prot->get_port(sk, snum)) {
+ if ((snum || !inet->bind_address_no_port) &&
+ sk->sk_prot->get_port(sk, snum)) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
@@ -1038,22 +1042,16 @@ void inet_register_protosw(struct inet_protosw *p)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
- answer = NULL;
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
-
/* Check only the non-wild match. */
- if (INET_PROTOSW_PERMANENT & answer->flags) {
- if (protocol == answer->protocol)
- break;
- last_perm = lh;
- }
-
- answer = NULL;
+ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
+ break;
+ if (protocol == answer->protocol)
+ goto out_permanent;
+ last_perm = lh;
}
- if (answer)
- goto out_permanent;
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
@@ -1432,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
struct net *net)
{
struct socket *sock;
- int rc = sock_create_kern(family, type, protocol, &sock);
+ int rc = sock_create_kern(net, family, type, protocol, &sock);
if (rc == 0) {
*sk = sock->sk;
@@ -1442,45 +1440,56 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
* we do not wish this socket to see incoming packets.
*/
(*sk)->sk_prot->unhash(*sk);
-
- sk_change_net(*sk, net);
}
return rc;
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
+u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt)
+{
+ return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt);
+}
+EXPORT_SYMBOL_GPL(snmp_get_cpu_field);
+
unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
unsigned long res = 0;
int i;
for_each_possible_cpu(i)
- res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
+ res += snmp_get_cpu_field(mib, i, offt);
return res;
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
#if BITS_PER_LONG==32
+u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt,
+ size_t syncp_offset)
+{
+ void *bhptr;
+ struct u64_stats_sync *syncp;
+ u64 v;
+ unsigned int start;
+
+ bhptr = per_cpu_ptr(mib, cpu);
+ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+ do {
+ start = u64_stats_fetch_begin_irq(syncp);
+ v = *(((u64 *)bhptr) + offt);
+ } while (u64_stats_fetch_retry_irq(syncp, start));
+
+ return v;
+}
+EXPORT_SYMBOL_GPL(snmp_get_cpu_field64);
+
u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
{
u64 res = 0;
int cpu;
for_each_possible_cpu(cpu) {
- void *bhptr;
- struct u64_stats_sync *syncp;
- u64 v;
- unsigned int start;
-
- bhptr = per_cpu_ptr(mib, cpu);
- syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
- do {
- start = u64_stats_fetch_begin_irq(syncp);
- v = *(((u64 *) bhptr) + offt);
- } while (u64_stats_fetch_retry_irq(syncp, start));
-
- res += v;
+ res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset);
}
return res;
}
@@ -1599,7 +1608,7 @@ static __net_init int inet_init_net(struct net *net)
*/
seqlock_init(&net->ipv4.ip_local_ports.lock);
net->ipv4.ip_local_ports.range[0] = 32768;
- net->ipv4.ip_local_ports.range[1] = 61000;
+ net->ipv4.ip_local_ports.range[1] = 60999;
seqlock_init(&net->ipv4.ping_group_range.lock);
/*
@@ -1781,6 +1790,8 @@ static int __init inet_init(void)
dev_add_pack(&ip_packet_type);
+ ip_tunnel_core_init();
+
rc = 0;
out:
return rc;
diff --git a/kernel/net/ipv4/ah4.c b/kernel/net/ipv4/ah4.c
index ac9a32ec3..f2a71025a 100644
--- a/kernel/net/ipv4/ah4.c
+++ b/kernel/net/ipv4/ah4.c
@@ -360,8 +360,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
ahp->icv_trunc_len + seqhi_len);
- if (!work_iph)
+ if (!work_iph) {
+ err = -ENOMEM;
goto out;
+ }
seqhi = (__be32 *)((char *)work_iph + ihl);
auth_data = ah_tmp_auth(seqhi, seqhi_len);
diff --git a/kernel/net/ipv4/arp.c b/kernel/net/ipv4/arp.c
index 933a92820..59b3e0e8f 100644
--- a/kernel/net/ipv4/arp.c
+++ b/kernel/net/ipv4/arp.c
@@ -113,6 +113,8 @@
#include <net/arp.h>
#include <net/ax25.h>
#include <net/netrom.h>
+#include <net/dst_metadata.h>
+#include <net/ip_tunnels.h>
#include <linux/uaccess.h>
@@ -233,7 +235,7 @@ static int arp_constructor(struct neighbour *neigh)
return -EINVAL;
}
- neigh->type = inet_addr_type(dev_net(dev), addr);
+ neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);
parms = in_dev->arp_parms;
__neigh_parms_put(neigh->parms);
@@ -291,6 +293,39 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
kfree_skb(skb);
}
+/* Create and send an arp packet. */
+static void arp_send_dst(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw,
+ struct dst_entry *dst)
+{
+ struct sk_buff *skb;
+
+ /* arp on this interface. */
+ if (dev->flags & IFF_NOARP)
+ return;
+
+ skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (!skb)
+ return;
+
+ skb_dst_set(skb, dst_clone(dst));
+ arp_xmit(skb);
+}
+
+void arp_send(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw, const unsigned char *src_hw,
+ const unsigned char *target_hw)
+{
+ arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw,
+ target_hw, NULL);
+}
+EXPORT_SYMBOL(arp_send);
+
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
__be32 saddr = 0;
@@ -299,6 +334,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
__be32 target = *(__be32 *)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
struct in_device *in_dev;
+ struct dst_entry *dst = NULL;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
@@ -309,7 +345,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
default:
case 0: /* By default announce any local IP */
- if (skb && inet_addr_type(dev_net(dev),
+ if (skb && inet_addr_type_dev_table(dev_net(dev), dev,
ip_hdr(skb)->saddr) == RTN_LOCAL)
saddr = ip_hdr(skb)->saddr;
break;
@@ -317,7 +353,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
if (!skb)
break;
saddr = ip_hdr(skb)->saddr;
- if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
+ if (inet_addr_type_dev_table(dev_net(dev), dev,
+ saddr) == RTN_LOCAL) {
/* saddr should be known to target */
if (inet_addr_onlink(in_dev, target, saddr))
break;
@@ -346,8 +383,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
}
}
- arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
- dst_hw, dev->dev_addr, NULL);
+ if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+ dst = skb_dst(skb);
+ arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+ dst_hw, dev->dev_addr, NULL, dst);
}
static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
@@ -585,48 +624,28 @@ out:
}
EXPORT_SYMBOL(arp_create);
+static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+
/*
* Send an arp packet.
*/
void arp_xmit(struct sk_buff *skb)
{
/* Send it off, maybe filter it using firewalling first. */
- NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb,
- NULL, skb->dev, dev_queue_xmit_sk);
+ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT,
+ dev_net(skb->dev), NULL, skb, NULL, skb->dev,
+ arp_xmit_finish);
}
EXPORT_SYMBOL(arp_xmit);
/*
- * Create and send an arp packet.
- */
-void arp_send(int type, int ptype, __be32 dest_ip,
- struct net_device *dev, __be32 src_ip,
- const unsigned char *dest_hw, const unsigned char *src_hw,
- const unsigned char *target_hw)
-{
- struct sk_buff *skb;
-
- /*
- * No arp on this interface.
- */
-
- if (dev->flags&IFF_NOARP)
- return;
-
- skb = arp_create(type, ptype, dest_ip, dev, src_ip,
- dest_hw, src_hw, target_hw);
- if (!skb)
- return;
-
- arp_xmit(skb);
-}
-EXPORT_SYMBOL(arp_send);
-
-/*
* Process an arp request.
*/
-static int arp_process(struct sock *sk, struct sk_buff *skb)
+static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -638,7 +657,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
u16 dev_type = dev->type;
int addr_type;
struct neighbour *n;
- struct net *net = dev_net(dev);
+ struct dst_entry *reply_dst = NULL;
bool is_garp = false;
/* arp_rcv below verifies the ARP header and verifies the device
@@ -739,13 +758,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
* cache.
*/
+ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
+ reply_dst = (struct dst_entry *)
+ iptunnel_metadata_reply(skb_metadata_dst(skb),
+ GFP_ATOMIC);
+
/* Special case: IPv4 duplicate address detection packet (RFC2131) */
if (sip == 0) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- inet_addr_type(net, tip) == RTN_LOCAL &&
+ inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
!arp_ignore(in_dev, sip, tip))
- arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
- dev->dev_addr, sha);
+ arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
+ sha, dev->dev_addr, sha, reply_dst);
goto out;
}
@@ -764,9 +788,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
if (!dont_send) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n) {
- arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
- dev, tip, sha, dev->dev_addr,
- sha);
+ arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
+ sip, dev, tip, sha,
+ dev->dev_addr, sha,
+ reply_dst);
neigh_release(n);
}
}
@@ -784,13 +809,14 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
- arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
- dev, tip, sha, dev->dev_addr,
- sha);
+ arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
+ sip, dev, tip, sha,
+ dev->dev_addr, sha,
+ reply_dst);
} else {
pneigh_enqueue(&arp_tbl,
in_dev->arp_parms, skb);
- return 0;
+ goto out_free_dst;
}
goto out;
}
@@ -802,16 +828,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
if (IN_DEV_ARP_ACCEPT(in_dev)) {
+ unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip);
+
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
- inet_addr_type(net, sip) == RTN_UNICAST;
+ addr_type == RTN_UNICAST;
if (!n &&
((arp->ar_op == htons(ARPOP_REPLY) &&
- inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))
+ addr_type == RTN_UNICAST) || is_garp))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
@@ -842,12 +870,14 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
out:
consume_skb(skb);
+out_free_dst:
+ dst_release(reply_dst);
return 0;
}
static void parp_redo(struct sk_buff *skb)
{
- arp_process(NULL, skb);
+ arp_process(dev_net(skb->dev), NULL, skb);
}
@@ -880,8 +910,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
- return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb,
- dev, NULL, arp_process);
+ return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,
+ dev_net(dev), NULL, skb, dev, NULL,
+ arp_process);
consumeskb:
consume_skb(skb);
@@ -1017,14 +1048,16 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
neigh = neigh_lookup(&arp_tbl, &ip, dev);
if (neigh) {
- read_lock_bh(&neigh->lock);
- memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
- r->arp_flags = arp_state_to_flags(neigh);
- read_unlock_bh(&neigh->lock);
- r->arp_ha.sa_family = dev->type;
- strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ if (!(neigh->nud_state & NUD_NOARP)) {
+ read_lock_bh(&neigh->lock);
+ memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
+ r->arp_ha.sa_family = dev->type;
+ strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ err = 0;
+ }
neigh_release(neigh);
- err = 0;
}
return err;
}
diff --git a/kernel/net/ipv4/datagram.c b/kernel/net/ipv4/datagram.c
index 574fad9cc..f915abff1 100644
--- a/kernel/net/ipv4/datagram.c
+++ b/kernel/net/ipv4/datagram.c
@@ -74,7 +74,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
- inet_set_txhash(sk);
+ sk_set_txhash(sk);
inet->inet_id = jiffies;
sk_dst_set(sk, &rt->dst);
diff --git a/kernel/net/ipv4/devinet.c b/kernel/net/ipv4/devinet.c
index 419d23c53..f6303b175 100644
--- a/kernel/net/ipv4/devinet.c
+++ b/kernel/net/ipv4/devinet.c
@@ -882,7 +882,6 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
queue_delayed_work(system_power_efficient_wq,
&check_lifetime_work, 0);
rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
- blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
}
return 0;
}
@@ -1645,7 +1644,8 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
}
-static size_t inet_get_link_af_size(const struct net_device *dev)
+static size_t inet_get_link_af_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
@@ -1655,7 +1655,8 @@ static size_t inet_get_link_af_size(const struct net_device *dev)
return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
}
-static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
+ u32 ext_filter_mask)
{
struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
struct nlattr *nla;
@@ -1740,6 +1741,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (type == -1 || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+ size += nla_total_size(4);
return size;
}
@@ -1780,6 +1783,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+ nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
+ goto nla_put_failure;
nlmsg_end(skb, nlh);
return 0;
@@ -1819,6 +1826,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
[NETCONFA_FORWARDING] = { .len = sizeof(int) },
[NETCONFA_RP_FILTER] = { .len = sizeof(int) },
[NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+ [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) },
};
static int inet_netconf_get_devconf(struct sk_buff *in_skb,
@@ -1839,7 +1847,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
if (err < 0)
goto errout;
- err = EINVAL;
+ err = -EINVAL;
if (!tb[NETCONFA_IFINDEX])
goto errout;
@@ -2048,6 +2056,12 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
ifindex, cnf);
}
+ if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ ifindex, cnf);
+ }
}
return ret;
@@ -2169,6 +2183,8 @@ static struct devinet_sysctl_table {
"igmpv2_unsolicited_report_interval"),
DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
"igmpv3_unsolicited_report_interval"),
+ DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
+ "ignore_routes_with_linkdown"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
@@ -2383,4 +2399,3 @@ void __init devinet_init(void)
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
inet_netconf_dump_devconf, NULL);
}
-
diff --git a/kernel/net/ipv4/esp4.c b/kernel/net/ipv4/esp4.c
index 30b544f02..477937465 100644
--- a/kernel/net/ipv4/esp4.c
+++ b/kernel/net/ipv4/esp4.c
@@ -49,7 +49,7 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
len = ALIGN(len, crypto_tfm_ctx_alignment());
}
- len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
+ len += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
len = ALIGN(len, __alignof__(struct scatterlist));
len += sizeof(struct scatterlist) * nfrags;
@@ -68,17 +68,6 @@ static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
}
-static inline struct aead_givcrypt_request *esp_tmp_givreq(
- struct crypto_aead *aead, u8 *iv)
-{
- struct aead_givcrypt_request *req;
-
- req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
- crypto_tfm_ctx_alignment());
- aead_givcrypt_set_tfm(req, aead);
- return req;
-}
-
static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
{
struct aead_request *req;
@@ -97,14 +86,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
-static inline struct scatterlist *esp_givreq_sg(
- struct crypto_aead *aead, struct aead_givcrypt_request *req)
-{
- return (void *)ALIGN((unsigned long)(req + 1) +
- crypto_aead_reqsize(aead),
- __alignof__(struct scatterlist));
-}
-
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -113,14 +94,37 @@ static void esp_output_done(struct crypto_async_request *base, int err)
xfrm_output_resume(skb, err);
}
+/* Move ESP header back into place. */
+static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
+{
+ struct ip_esp_hdr *esph = (void *)(skb->data + offset);
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ __be32 *seqhi = esp_tmp_seqhi(tmp);
+
+ esph->seq_no = esph->spi;
+ esph->spi = *seqhi;
+}
+
+static void esp_output_restore_header(struct sk_buff *skb)
+{
+ esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
+}
+
+static void esp_output_done_esn(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ esp_output_restore_header(skb);
+ esp_output_done(base, err);
+}
+
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
- struct aead_givcrypt_request *req;
+ struct aead_request *req;
struct scatterlist *sg;
- struct scatterlist *asg;
struct sk_buff *trailer;
void *tmp;
u8 *iv;
@@ -129,17 +133,19 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int clen;
int alen;
int plen;
+ int ivlen;
int tfclen;
int nfrags;
int assoclen;
- int sglists;
int seqhilen;
__be32 *seqhi;
+ __be64 seqno;
/* skb is pure payload to encrypt */
aead = x->data;
alen = crypto_aead_authsize(aead);
+ ivlen = crypto_aead_ivsize(aead);
tfclen = 0;
if (x->tfcpad) {
@@ -160,16 +166,14 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
nfrags = err;
assoclen = sizeof(*esph);
- sglists = 1;
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- sglists += 2;
seqhilen += sizeof(__be32);
assoclen += seqhilen;
}
- tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp) {
err = -ENOMEM;
goto error;
@@ -177,9 +181,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
seqhi = esp_tmp_seqhi(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
- req = esp_tmp_givreq(aead, iv);
- asg = esp_givreq_sg(aead, req);
- sg = asg + sglists;
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
/* Fill padding... */
tail = skb_tail_pointer(trailer);
@@ -235,37 +238,53 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
*skb_mac_header(skb) = IPPROTO_UDP;
}
- esph->spi = x->id.spi;
esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ aead_request_set_callback(req, 0, esp_output_done, skb);
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * encryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
+ *seqhi = esph->spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ }
+
+ esph->spi = x->id.spi;
+
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
- esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
- clen + alen);
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
- if ((x->props.flags & XFRM_STATE_ESN)) {
- sg_init_table(asg, 3);
- sg_set_buf(asg, &esph->spi, sizeof(__be32));
- *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
- sg_set_buf(asg + 1, seqhi, seqhilen);
- sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
- } else
- sg_init_one(asg, esph, sizeof(*esph));
-
- aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
- aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
- aead_givcrypt_set_assoc(req, asg, assoclen);
- aead_givcrypt_set_giv(req, esph->enc_data,
- XFRM_SKB_CB(skb)->seq.output.low +
- ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+ aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+ aead_request_set_ad(req, assoclen);
+
+ seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+
+ memset(iv, 0, ivlen);
+ memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8),
+ min(ivlen, 8));
ESP_SKB_CB(skb)->tmp = tmp;
- err = crypto_aead_givencrypt(req);
- if (err == -EINPROGRESS)
+ err = crypto_aead_encrypt(req);
+
+ switch (err) {
+ case -EINPROGRESS:
goto error;
- if (err == -EBUSY)
+ case -EBUSY:
err = NET_XMIT_DROP;
+ break;
+
+ case 0:
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_output_restore_header(skb);
+ }
kfree(tmp);
@@ -364,6 +383,20 @@ static void esp_input_done(struct crypto_async_request *base, int err)
xfrm_input_resume(skb, esp_input_done2(skb, err));
}
+static void esp_input_restore_header(struct sk_buff *skb)
+{
+ esp_restore_header(skb, 0);
+ __skb_pull(skb, 4);
+}
+
+static void esp_input_done_esn(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ esp_input_restore_header(skb);
+ esp_input_done(base, err);
+}
+
/*
* Note: detecting truncated vs. non-truncated authentication data is very
* expensive, so we only support truncated data, which is the recommended
@@ -375,19 +408,18 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
- int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
+ int ivlen = crypto_aead_ivsize(aead);
+ int elen = skb->len - sizeof(*esph) - ivlen;
int nfrags;
int assoclen;
- int sglists;
int seqhilen;
__be32 *seqhi;
void *tmp;
u8 *iv;
struct scatterlist *sg;
- struct scatterlist *asg;
int err = -EINVAL;
- if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+ if (!pskb_may_pull(skb, sizeof(*esph) + ivlen))
goto out;
if (elen <= 0)
@@ -400,17 +432,15 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
nfrags = err;
assoclen = sizeof(*esph);
- sglists = 1;
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- sglists += 2;
seqhilen += sizeof(__be32);
assoclen += seqhilen;
}
err = -ENOMEM;
- tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp)
goto out;
@@ -418,36 +448,39 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
seqhi = esp_tmp_seqhi(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
- asg = esp_req_sg(aead, req);
- sg = asg + sglists;
+ sg = esp_req_sg(aead, req);
skb->ip_summed = CHECKSUM_NONE;
esph = (struct ip_esp_hdr *)skb->data;
- /* Get ivec. This can be wrong, check against another impls. */
- iv = esph->enc_data;
-
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+ aead_request_set_callback(req, 0, esp_input_done, skb);
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * decryption.
+ */
if ((x->props.flags & XFRM_STATE_ESN)) {
- sg_init_table(asg, 3);
- sg_set_buf(asg, &esph->spi, sizeof(__be32));
- *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
- sg_set_buf(asg + 1, seqhi, seqhilen);
- sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
- } else
- sg_init_one(asg, esph, sizeof(*esph));
+ esph = (void *)skb_push(skb, 4);
+ *seqhi = esph->spi;
+ esph->spi = esph->seq_no;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+ aead_request_set_callback(req, 0, esp_input_done_esn, skb);
+ }
- aead_request_set_callback(req, 0, esp_input_done, skb);
- aead_request_set_crypt(req, sg, sg, elen, iv);
- aead_request_set_assoc(req, asg, assoclen);
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, 0, skb->len);
+
+ aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
+ aead_request_set_ad(req, assoclen);
err = crypto_aead_decrypt(req);
if (err == -EINPROGRESS)
goto out;
+ if ((x->props.flags & XFRM_STATE_ESN))
+ esp_input_restore_header(skb);
+
err = esp_input_done2(skb, err);
out:
@@ -519,10 +552,16 @@ static void esp_destroy(struct xfrm_state *x)
static int esp_init_aead(struct xfrm_state *x)
{
+ char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
- aead = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+ err = -ENAMETOOLONG;
+ if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
+ x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+
+ aead = crypto_alloc_aead(aead_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
@@ -561,15 +600,19 @@ static int esp_init_authenc(struct xfrm_state *x)
if ((x->props.flags & XFRM_STATE_ESN)) {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
- "authencesn(%s,%s)",
+ "%s%sauthencesn(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
goto error;
} else {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
- "authenc(%s,%s)",
+ "%s%sauthenc(%s,%s)%s",
+ x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ x->ealg->alg_name,
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
goto error;
}
diff --git a/kernel/net/ipv4/fib_frontend.c b/kernel/net/ipv4/fib_frontend.c
index 872494e6e..473447593 100644
--- a/kernel/net/ipv4/fib_frontend.c
+++ b/kernel/net/ipv4/fib_frontend.c
@@ -45,6 +45,8 @@
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
+#include <net/l3mdev.h>
+#include <trace/events/fib.h>
#ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -211,12 +213,12 @@ void fib_flush_external(struct net *net)
*/
static inline unsigned int __inet_dev_addr_type(struct net *net,
const struct net_device *dev,
- __be32 addr)
+ __be32 addr, u32 tb_id)
{
struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
unsigned int ret = RTN_BROADCAST;
- struct fib_table *local_table;
+ struct fib_table *table;
if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
return RTN_BROADCAST;
@@ -225,10 +227,10 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
rcu_read_lock();
- local_table = fib_get_table(net, RT_TABLE_LOCAL);
- if (local_table) {
+ table = fib_get_table(net, tb_id);
+ if (table) {
ret = RTN_UNICAST;
- if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
+ if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
}
@@ -238,19 +240,40 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
return ret;
}
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
+{
+ return __inet_dev_addr_type(net, NULL, addr, tb_id);
+}
+EXPORT_SYMBOL(inet_addr_type_table);
+
unsigned int inet_addr_type(struct net *net, __be32 addr)
{
- return __inet_dev_addr_type(net, NULL, addr);
+ return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
}
EXPORT_SYMBOL(inet_addr_type);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr)
{
- return __inet_dev_addr_type(net, dev, addr);
+ u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+
+ return __inet_dev_addr_type(net, dev, addr, rt_table);
}
EXPORT_SYMBOL(inet_dev_addr_type);
+/* inet_addr_type with dev == NULL but using the table from a dev
+ * if one is associated
+ */
+unsigned int inet_addr_type_dev_table(struct net *net,
+ const struct net_device *dev,
+ __be32 addr)
+{
+ u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+
+ return __inet_dev_addr_type(net, NULL, addr, rt_table);
+}
+EXPORT_SYMBOL(inet_addr_type_dev_table);
+
__be32 fib_compute_spec_dst(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
@@ -280,7 +303,8 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_scope = scope;
fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
- if (!fib_lookup(net, &fl4, &res))
+ fl4.flowi4_tun_key.tun_id = 0;
+ if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
} else {
scope = RT_SCOPE_LINK;
@@ -308,18 +332,24 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
bool dev_match;
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+ fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
+ if (!fl4.flowi4_iif)
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
fl4.saddr = dst;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_tun_key.tun_id = 0;
+ fl4.flowi4_flags = 0;
no_addr = idev->ifa_list == NULL;
fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
+ trace_fib_validate_source(dev, &fl4);
+
net = dev_net(dev);
- if (fib_lookup(net, &fl4, &res))
+ if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
if (res.type != RTN_UNICAST &&
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
@@ -337,6 +367,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
if (nh->nh_dev == dev) {
dev_match = true;
break;
+ } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
+ dev_match = true;
+ break;
}
}
#else
@@ -354,7 +387,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_oif = dev->ifindex;
ret = 0;
- if (fib_lookup(net, &fl4, &res) == 0) {
+ if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
if (res.type == RTN_UNICAST)
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
}
@@ -494,9 +527,12 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
addr = sk_extract_addr(&rt->rt_gateway);
if (rt->rt_gateway.sa_family == AF_INET && addr) {
+ unsigned int addr_type;
+
cfg->fc_gw = addr;
+ addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
if (rt->rt_flags & RTF_GATEWAY &&
- inet_addr_type(net, addr) == RTN_UNICAST)
+ addr_type == RTN_UNICAST)
cfg->fc_scope = RT_SCOPE_UNIVERSE;
}
@@ -591,6 +627,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
[RTA_FLOW] = { .type = NLA_U32 },
+ [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [RTA_ENCAP] = { .type = NLA_NESTED },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -656,6 +694,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
case RTA_TABLE:
cfg->fc_table = nla_get_u32(attr);
break;
+ case RTA_ENCAP:
+ cfg->fc_encap = attr;
+ break;
+ case RTA_ENCAP_TYPE:
+ cfg->fc_encap_type = nla_get_u16(attr);
+ break;
}
}
@@ -760,6 +804,7 @@ out:
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
+ u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
struct fib_table *tb;
struct fib_config cfg = {
.fc_protocol = RTPROT_KERNEL,
@@ -774,11 +819,10 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
},
};
- if (type == RTN_UNICAST)
- tb = fib_new_table(net, RT_TABLE_MAIN);
- else
- tb = fib_new_table(net, RT_TABLE_LOCAL);
+ if (!tb_id)
+ tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
+ tb = fib_new_table(net, tb_id);
if (!tb)
return;
@@ -823,9 +867,10 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_NEWROUTE,
- dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- prefix, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
@@ -870,9 +915,10 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
}
} else if (!ipv4_is_zeronet(any) &&
(any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_DELROUTE,
- dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- any, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
subnet = 1;
}
@@ -960,11 +1006,14 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
}
if (!(ok & LOCAL_OK)) {
+ unsigned int addr_type;
+
fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
/* Check, that this local address finally disappeared. */
- if (gone &&
- inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
+ addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
+ ifa->ifa_local);
+ if (gone && addr_type != RTN_LOCAL) {
/* And the last, but not the least thing.
* We must flush stray FIB entries.
*
@@ -1063,9 +1112,10 @@ static void nl_fib_lookup_exit(struct net *net)
net->ipv4.fibnl = NULL;
}
-static void fib_disable_ip(struct net_device *dev, int force)
+static void fib_disable_ip(struct net_device *dev, unsigned long event,
+ bool force)
{
- if (fib_sync_down_dev(dev, force))
+ if (fib_sync_down_dev(dev, event, force))
fib_flush(dev_net(dev));
rt_cache_flush(dev_net(dev));
arp_ifdown(dev);
@@ -1081,7 +1131,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
case NETDEV_UP:
fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- fib_sync_up(dev);
+ fib_sync_up(dev, RTNH_F_DEAD);
#endif
atomic_inc(&net->ipv4.dev_addr_genid);
rt_cache_flush(dev_net(dev));
@@ -1093,7 +1143,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
/* Last address was deleted from this interface.
* Disable IP.
*/
- fib_disable_ip(dev, 1);
+ fib_disable_ip(dev, event, true);
} else {
rt_cache_flush(dev_net(dev));
}
@@ -1105,11 +1155,13 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_changeupper_info *info;
struct in_device *in_dev;
struct net *net = dev_net(dev);
+ unsigned int flags;
if (event == NETDEV_UNREGISTER) {
- fib_disable_ip(dev, 2);
+ fib_disable_ip(dev, event, true);
rt_flush_dev(dev);
return NOTIFY_DONE;
}
@@ -1124,18 +1176,32 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
fib_add_ifaddr(ifa);
} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- fib_sync_up(dev);
+ fib_sync_up(dev, RTNH_F_DEAD);
#endif
atomic_inc(&net->ipv4.dev_addr_genid);
rt_cache_flush(net);
break;
case NETDEV_DOWN:
- fib_disable_ip(dev, 0);
+ fib_disable_ip(dev, event, false);
break;
- case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
+ flags = dev_get_flags(dev);
+ if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+ fib_sync_up(dev, RTNH_F_LINKDOWN);
+ else
+ fib_sync_down_dev(dev, event, false);
+ /* fall through */
+ case NETDEV_CHANGEMTU:
rt_cache_flush(net);
break;
+ case NETDEV_CHANGEUPPER:
+ info = ptr;
+ /* flush all routes if dev is linked to or unlinked from
+ * an L3 master device (e.g., VRF)
+ */
+ if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+ fib_disable_ip(dev, NETDEV_DOWN, true);
+ break;
}
return NOTIFY_DONE;
}
diff --git a/kernel/net/ipv4/fib_lookup.h b/kernel/net/ipv4/fib_lookup.h
index c6211ed60..9c0292072 100644
--- a/kernel/net/ipv4/fib_lookup.h
+++ b/kernel/net/ipv4/fib_lookup.h
@@ -13,6 +13,7 @@ struct fib_alias {
u8 fa_state;
u8 fa_slen;
u32 tb_id;
+ s16 fa_default;
struct rcu_head rcu;
};
diff --git a/kernel/net/ipv4/fib_rules.c b/kernel/net/ipv4/fib_rules.c
index 56151982f..f2bda9e89 100644
--- a/kernel/net/ipv4/fib_rules.c
+++ b/kernel/net/ipv4/fib_rules.c
@@ -47,11 +47,12 @@ struct fib4_rule {
#endif
};
-int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp,
+ struct fib_result *res, unsigned int flags)
{
struct fib_lookup_arg arg = {
.result = res,
- .flags = FIB_LOOKUP_NOREF,
+ .flags = flags,
};
int err;
@@ -317,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
.delete = fib4_rule_delete,
.compare = fib4_rule_compare,
.fill = fib4_rule_fill,
- .default_pref = fib_default_rule_pref,
.nlmsg_payload = fib4_rule_nlmsg_payload,
.flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
diff --git a/kernel/net/ipv4/fib_semantics.c b/kernel/net/ipv4/fib_semantics.c
index 8d695b665..d97268e8f 100644
--- a/kernel/net/ipv4/fib_semantics.c
+++ b/kernel/net/ipv4/fib_semantics.c
@@ -42,6 +42,7 @@
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>
+#include <net/lwtunnel.h>
#include "fib_lookup.h"
@@ -56,8 +57,7 @@ static unsigned int fib_info_cnt;
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-static DEFINE_SPINLOCK(fib_multipath_lock);
+u32 fib_multipath_secret __read_mostly;
#define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
@@ -208,6 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
change_nexthops(fi) {
if (nexthop_nh->nh_dev)
dev_put(nexthop_nh->nh_dev);
+ lwtstate_put(nexthop_nh->nh_lwtstate);
free_nh_exceptions(nexthop_nh);
rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
@@ -266,7 +267,8 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
- ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
+ lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
+ ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
return -1;
onh++;
} endfor_nexthops(fi);
@@ -318,7 +320,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
nfi->fib_type == fi->fib_type &&
memcmp(nfi->fib_metrics, fi->fib_metrics,
sizeof(u32) * RTAX_MAX) == 0 &&
- ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
+ !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
(nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
return fi;
}
@@ -366,6 +368,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
if (fi->fib_nhs) {
+ size_t nh_encapsize = 0;
/* Also handles the special case fib_nhs == 1 */
/* each nexthop is packed in an attribute */
@@ -374,8 +377,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
/* may contain flow and gateway attribute */
nhsize += 2 * nla_total_size(4);
+ /* grab encap info */
+ for_nexthops(fi) {
+ if (nh->nh_lwtstate) {
+ /* RTA_ENCAP_TYPE */
+ nh_encapsize += lwtunnel_get_encap_size(
+ nh->nh_lwtstate);
+ /* RTA_ENCAP */
+ nh_encapsize += nla_total_size(2);
+ }
+ } endfor_nexthops(fi);
+
/* all nexthops are packed in a nested attribute */
- payload += nla_total_size(fi->fib_nhs * nhsize);
+ payload += nla_total_size((fi->fib_nhs * nhsize) +
+ nh_encapsize);
+
}
return payload;
@@ -421,13 +437,15 @@ static int fib_detect_death(struct fib_info *fi, int order,
if (n) {
state = n->nud_state;
neigh_release(n);
+ } else {
+ return 0;
}
if (state == NUD_REACHABLE)
return 0;
if ((state & NUD_VALID) && order != dflt)
return 0;
if ((state & NUD_VALID) ||
- (*last_idx < 0 && order > dflt)) {
+ (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
*last_resort = fi;
*last_idx = order;
}
@@ -452,6 +470,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg)
{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ int ret;
+
change_nexthops(fi) {
int attrlen;
@@ -475,18 +496,130 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
if (nexthop_nh->nh_tclassid)
fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
+ nla = nla_find(attrs, attrlen, RTA_ENCAP);
+ if (nla) {
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+ struct nlattr *nla_entype;
+
+ nla_entype = nla_find(attrs, attrlen,
+ RTA_ENCAP_TYPE);
+ if (!nla_entype)
+ goto err_inval;
+ if (cfg->fc_oif)
+ dev = __dev_get_by_index(net, cfg->fc_oif);
+ ret = lwtunnel_build_state(dev, nla_get_u16(
+ nla_entype),
+ nla, AF_INET, cfg,
+ &lwtstate);
+ if (ret)
+ goto errout;
+ nexthop_nh->nh_lwtstate =
+ lwtstate_get(lwtstate);
+ }
}
rtnh = rtnh_next(rtnh, &remaining);
} endfor_nexthops(fi);
return 0;
+
+err_inval:
+ ret = -EINVAL;
+
+errout:
+ return ret;
}
-#endif
+static void fib_rebalance(struct fib_info *fi)
+{
+ int total;
+ int w;
+ struct in_device *in_dev;
+
+ if (fi->fib_nhs < 2)
+ return;
+
+ total = 0;
+ for_nexthops(fi) {
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+
+ in_dev = __in_dev_get_rtnl(nh->nh_dev);
+
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nh->nh_flags & RTNH_F_LINKDOWN)
+ continue;
+
+ total += nh->nh_weight;
+ } endfor_nexthops(fi);
+
+ w = 0;
+ change_nexthops(fi) {
+ int upper_bound;
+
+ in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);
+
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
+ upper_bound = -1;
+ } else if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
+ upper_bound = -1;
+ } else {
+ w += nexthop_nh->nh_weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+ total) - 1;
+ }
+
+ atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
+ } endfor_nexthops(fi);
+
+ net_get_random_once(&fib_multipath_secret,
+ sizeof(fib_multipath_secret));
+}
+
+static inline void fib_add_weight(struct fib_info *fi,
+ const struct fib_nh *nh)
+{
+ fi->fib_weight += nh->nh_weight;
+}
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define fib_rebalance(fi) do { } while (0)
+#define fib_add_weight(fi, nh) do { } while (0)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+static int fib_encap_match(struct net *net, u16 encap_type,
+ struct nlattr *encap,
+ int oif, const struct fib_nh *nh,
+ const struct fib_config *cfg)
+{
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+ int ret, result = 0;
+
+ if (encap_type == LWTUNNEL_ENCAP_NONE)
+ return 0;
+
+ if (oif)
+ dev = __dev_get_by_index(net, oif);
+ ret = lwtunnel_build_state(dev, encap_type, encap,
+ AF_INET, cfg, &lwtstate);
+ if (!ret) {
+ result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
+ lwtstate_free(lwtstate);
+ }
+
+ return result;
+}
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
+ struct net *net = cfg->fc_nlinfo.nl_net;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
struct rtnexthop *rtnh;
int remaining;
@@ -496,6 +629,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
return 1;
if (cfg->fc_oif || cfg->fc_gw) {
+ if (cfg->fc_encap) {
+ if (fib_encap_match(net, cfg->fc_encap_type,
+ cfg->fc_encap, cfg->fc_oif,
+ fi->fib_nh, cfg))
+ return 1;
+ }
if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
(!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
return 0;
@@ -585,7 +724,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
struct fib_nh *nh)
{
- int err;
+ int err = 0;
struct net *net;
struct net_device *dev;
@@ -594,16 +733,20 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
struct fib_result res;
if (nh->nh_flags & RTNH_F_ONLINK) {
+ unsigned int addr_type;
if (cfg->fc_scope >= RT_SCOPE_LINK)
return -EINVAL;
- if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
- return -EINVAL;
dev = __dev_get_by_index(net, nh->nh_oif);
if (!dev)
return -ENODEV;
if (!(dev->flags & IFF_UP))
return -ENETDOWN;
+ addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
+ if (addr_type != RTN_UNICAST)
+ return -EINVAL;
+ if (!netif_carrier_ok(dev))
+ nh->nh_flags |= RTNH_F_LINKDOWN;
nh->nh_dev = dev;
dev_hold(dev);
nh->nh_scope = RT_SCOPE_LINK;
@@ -611,6 +754,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
}
rcu_read_lock();
{
+ struct fib_table *tbl = NULL;
struct flowi4 fl4 = {
.daddr = nh->nh_gw,
.flowi4_scope = cfg->fc_scope + 1,
@@ -621,7 +765,24 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
/* It is not necessary, but requires a bit of thinking */
if (fl4.flowi4_scope < RT_SCOPE_LINK)
fl4.flowi4_scope = RT_SCOPE_LINK;
- err = fib_lookup(net, &fl4, &res);
+
+ if (cfg->fc_table)
+ tbl = fib_get_table(net, cfg->fc_table);
+
+ if (tbl)
+ err = fib_table_lookup(tbl, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE |
+ FIB_LOOKUP_NOREF);
+
+ /* on error or if no table given do full lookup. This
+ * is needed for example when nexthops are in the local
+ * table rather than the given table
+ */
+ if (!tbl || err) {
+ err = fib_lookup(net, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE);
+ }
+
if (err) {
rcu_read_unlock();
return err;
@@ -636,6 +797,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
if (!dev)
goto out;
dev_hold(dev);
+ if (!netif_carrier_ok(dev))
+ nh->nh_flags |= RTNH_F_LINKDOWN;
err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
} else {
struct in_device *in_dev;
@@ -654,6 +817,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
nh->nh_dev = in_dev->dev;
dev_hold(nh->nh_dev);
nh->nh_scope = RT_SCOPE_HOST;
+ if (!netif_carrier_ok(nh->nh_dev))
+ nh->nh_flags |= RTNH_F_LINKDOWN;
err = 0;
}
out:
@@ -713,8 +878,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *dest;
unsigned int new_hash;
- hlist_del(&fi->fib_hash);
-
new_hash = fib_info_hashfn(fi);
dest = &new_info_hash[new_hash];
hlist_add_head(&fi->fib_hash, dest);
@@ -731,8 +894,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *ldest;
unsigned int new_hash;
- hlist_del(&fi->fib_lhash);
-
new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
ldest = &new_laddrhash[new_hash];
hlist_add_head(&fi->fib_lhash, ldest);
@@ -757,6 +918,74 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
return nh->nh_saddr;
}
+static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
+{
+ if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
+ fib_prefsrc != cfg->fc_dst) {
+ u32 tb_id = cfg->fc_table;
+ int rc;
+
+ if (tb_id == RT_TABLE_MAIN)
+ tb_id = RT_TABLE_LOCAL;
+
+ rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, tb_id);
+
+ if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
+ rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, RT_TABLE_LOCAL);
+ }
+
+ if (rc != RTN_LOCAL)
+ return false;
+ }
+ return true;
+}
+
+static int
+fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
+{
+ bool ecn_ca = false;
+ struct nlattr *nla;
+ int remaining;
+
+ if (!cfg->fc_mx)
+ return 0;
+
+ nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+ int type = nla_type(nla);
+ u32 val;
+
+ if (!type)
+ continue;
+ if (type > RTAX_MAX)
+ return -EINVAL;
+
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ if (val == TCP_CA_UNSPEC)
+ return -EINVAL;
+ } else {
+ val = nla_get_u32(nla);
+ }
+ if (type == RTAX_ADVMSS && val > 65535 - 40)
+ val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ return -EINVAL;
+ fi->fib_metrics[type - 1] = val;
+ }
+
+ if (ecn_ca)
+ fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+
+ return 0;
+}
+
struct fib_info *fib_create_info(struct fib_config *cfg)
{
int err;
@@ -829,36 +1058,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto failure;
} endfor_nexthops(fi)
- if (cfg->fc_mx) {
- struct nlattr *nla;
- int remaining;
-
- nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
- int type = nla_type(nla);
-
- if (type) {
- u32 val;
-
- if (type > RTAX_MAX)
- goto err_inval;
- if (type == RTAX_CC_ALGO) {
- char tmp[TCP_CA_NAME_MAX];
-
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(tmp);
- if (val == TCP_CA_UNSPEC)
- goto err_inval;
- } else {
- val = nla_get_u32(nla);
- }
- if (type == RTAX_ADVMSS && val > 65535 - 40)
- val = 65535 - 40;
- if (type == RTAX_MTU && val > 65535 - 15)
- val = 65535 - 15;
- fi->fib_metrics[type - 1] = val;
- }
- }
- }
+ err = fib_convert_metrics(fi, cfg);
+ if (err)
+ goto failure;
if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -879,6 +1081,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
} else {
struct fib_nh *nh = fi->fib_nh;
+ if (cfg->fc_encap) {
+ struct lwtunnel_state *lwtstate;
+ struct net_device *dev = NULL;
+
+ if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
+ goto err_inval;
+ if (cfg->fc_oif)
+ dev = __dev_get_by_index(net, cfg->fc_oif);
+ err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ cfg->fc_encap, AF_INET, cfg,
+ &lwtstate);
+ if (err)
+ goto failure;
+
+ nh->nh_lwtstate = lwtstate_get(lwtstate);
+ }
nh->nh_oif = cfg->fc_oif;
nh->nh_gw = cfg->fc_gw;
nh->nh_flags = cfg->fc_flags;
@@ -924,24 +1142,29 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (!nh->nh_dev)
goto failure;
} else {
+ int linkdown = 0;
+
change_nexthops(fi) {
err = fib_check_nh(cfg, fi, nexthop_nh);
if (err != 0)
goto failure;
+ if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
+ linkdown++;
} endfor_nexthops(fi)
+ if (linkdown == fi->fib_nhs)
+ fi->fib_flags |= RTNH_F_LINKDOWN;
}
- if (fi->fib_prefsrc) {
- if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
- fi->fib_prefsrc != cfg->fc_dst)
- if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
- goto err_inval;
- }
+ if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc))
+ goto err_inval;
change_nexthops(fi) {
fib_info_update_nh_saddr(net, nexthop_nh);
+ fib_add_weight(fi, nexthop_nh);
} endfor_nexthops(fi)
+ fib_rebalance(fi);
+
link_it:
ofi = fib_find_info(fi);
if (ofi) {
@@ -1027,17 +1250,27 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
goto nla_put_failure;
if (fi->fib_nhs == 1) {
+ struct in_device *in_dev;
+
if (fi->fib_nh->nh_gw &&
nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
goto nla_put_failure;
if (fi->fib_nh->nh_oif &&
nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
goto nla_put_failure;
+ if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
+ in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev);
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+ rtm->rtm_flags |= RTNH_F_DEAD;
+ }
#ifdef CONFIG_IP_ROUTE_CLASSID
if (fi->fib_nh[0].nh_tclassid &&
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
goto nla_put_failure;
#endif
+ if (fi->fib_nh->nh_lwtstate)
+ lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate);
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (fi->fib_nhs > 1) {
@@ -1049,11 +1282,19 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
for_nexthops(fi) {
+ struct in_device *in_dev;
+
rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
if (!rtnh)
goto nla_put_failure;
rtnh->rtnh_flags = nh->nh_flags & 0xFF;
+ if (nh->nh_flags & RTNH_F_LINKDOWN) {
+ in_dev = __in_dev_get_rtnl(nh->nh_dev);
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+ rtnh->rtnh_flags |= RTNH_F_DEAD;
+ }
rtnh->rtnh_hops = nh->nh_weight - 1;
rtnh->rtnh_ifindex = nh->nh_oif;
@@ -1065,6 +1306,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
goto nla_put_failure;
#endif
+ if (nh->nh_lwtstate)
+ lwtunnel_fill_encap(skb, nh->nh_lwtstate);
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
} endfor_nexthops(fi);
@@ -1107,7 +1350,13 @@ int fib_sync_down_addr(struct net *net, __be32 local)
return ret;
}
-int fib_sync_down_dev(struct net_device *dev, int force)
+/* Event force Flags Description
+ * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host
+ * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
+ * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed
+ * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed
+ */
+int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
int ret = 0;
int scope = RT_SCOPE_NOWHERE;
@@ -1133,49 +1382,79 @@ int fib_sync_down_dev(struct net_device *dev, int force)
dead++;
else if (nexthop_nh->nh_dev == dev &&
nexthop_nh->nh_scope != scope) {
- nexthop_nh->nh_flags |= RTNH_F_DEAD;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- spin_lock_bh(&fib_multipath_lock);
- fi->fib_power -= nexthop_nh->nh_power;
- nexthop_nh->nh_power = 0;
- spin_unlock_bh(&fib_multipath_lock);
-#endif
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ nexthop_nh->nh_flags |= RTNH_F_DEAD;
+ /* fall through */
+ case NETDEV_CHANGE:
+ nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
+ break;
+ }
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (force > 1 && nexthop_nh->nh_dev == dev) {
+ if (event == NETDEV_UNREGISTER &&
+ nexthop_nh->nh_dev == dev) {
dead = fi->fib_nhs;
break;
}
#endif
} endfor_nexthops(fi)
if (dead == fi->fib_nhs) {
- fi->fib_flags |= RTNH_F_DEAD;
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ fi->fib_flags |= RTNH_F_DEAD;
+ /* fall through */
+ case NETDEV_CHANGE:
+ fi->fib_flags |= RTNH_F_LINKDOWN;
+ break;
+ }
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
}
/* Must be invoked inside of an RCU protected region. */
-void fib_select_default(struct fib_result *res)
+void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
struct fib_info *fi = NULL, *last_resort = NULL;
struct hlist_head *fa_head = res->fa_head;
struct fib_table *tb = res->table;
+ u8 slen = 32 - res->prefixlen;
int order = -1, last_idx = -1;
- struct fib_alias *fa;
+ struct fib_alias *fa, *fa1 = NULL;
+ u32 last_prio = res->fi->fib_priority;
+ u8 last_tos = 0;
hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
+ if (fa->fa_slen != slen)
+ continue;
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fa->tb_id != tb->tb_id)
+ continue;
+ if (next_fi->fib_priority > last_prio &&
+ fa->fa_tos == last_tos) {
+ if (last_tos)
+ continue;
+ break;
+ }
+ if (next_fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ last_tos = fa->fa_tos;
+ last_prio = next_fi->fib_priority;
+
if (next_fi->fib_scope != res->scope ||
fa->fa_type != RTN_UNICAST)
continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
@@ -1185,10 +1464,11 @@ void fib_select_default(struct fib_result *res)
if (!fi) {
if (next_fi != res->fi)
break;
+ fa1 = fa;
} else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
+ &last_idx, fa1->fa_default)) {
fib_result_assign(res, fi);
- tb->tb_default = order;
+ fa1->fa_default = order;
goto out;
}
fi = next_fi;
@@ -1196,31 +1476,30 @@ void fib_select_default(struct fib_result *res)
}
if (order <= 0 || !fi) {
- tb->tb_default = -1;
+ if (fa1)
+ fa1->fa_default = -1;
goto out;
}
if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
+ fa1->fa_default)) {
fib_result_assign(res, fi);
- tb->tb_default = order;
+ fa1->fa_default = order;
goto out;
}
if (last_idx >= 0)
fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
+ fa1->fa_default = last_idx;
out:
return;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
/*
* Dead device goes up. We wake up dead nexthops.
* It takes sense only on multipath routes.
*/
-int fib_sync_up(struct net_device *dev)
+int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
{
struct fib_info *prev_fi;
unsigned int hash;
@@ -1231,6 +1510,13 @@ int fib_sync_up(struct net_device *dev)
if (!(dev->flags & IFF_UP))
return 0;
+ if (nh_flags & RTNH_F_DEAD) {
+ unsigned int flags = dev_get_flags(dev);
+
+ if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+ nh_flags |= RTNH_F_LINKDOWN;
+ }
+
prev_fi = NULL;
hash = fib_devindex_hashfn(dev->ifindex);
head = &fib_info_devhash[hash];
@@ -1247,7 +1533,7 @@ int fib_sync_up(struct net_device *dev)
prev_fi = fi;
alive = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags & nh_flags)) {
alive++;
continue;
}
@@ -1258,71 +1544,57 @@ int fib_sync_up(struct net_device *dev)
!__in_dev_get_rtnl(dev))
continue;
alive++;
- spin_lock_bh(&fib_multipath_lock);
- nexthop_nh->nh_power = 0;
- nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
- spin_unlock_bh(&fib_multipath_lock);
+ nexthop_nh->nh_flags &= ~nh_flags;
} endfor_nexthops(fi)
if (alive > 0) {
- fi->fib_flags &= ~RTNH_F_DEAD;
+ fi->fib_flags &= ~nh_flags;
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
}
-/*
- * The algorithm is suboptimal, but it provides really
- * fair weighted route distribution.
- */
-void fib_select_multipath(struct fib_result *res)
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+void fib_select_multipath(struct fib_result *res, int hash)
{
struct fib_info *fi = res->fi;
- int w;
- spin_lock_bh(&fib_multipath_lock);
- if (fi->fib_power <= 0) {
- int power = 0;
- change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
- power += nexthop_nh->nh_weight;
- nexthop_nh->nh_power = nexthop_nh->nh_weight;
- }
- } endfor_nexthops(fi);
- fi->fib_power = power;
- if (power <= 0) {
- spin_unlock_bh(&fib_multipath_lock);
- /* Race condition: route has just become dead. */
- res->nh_sel = 0;
- return;
- }
- }
-
-
- /* w should be random number [0..fi->fib_power-1],
- * it is pretty bad approximation.
- */
-
- w = jiffies % fi->fib_power;
+ for_nexthops(fi) {
+ if (hash > atomic_read(&nh->nh_upper_bound))
+ continue;
- change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
- nexthop_nh->nh_power) {
- w -= nexthop_nh->nh_power;
- if (w <= 0) {
- nexthop_nh->nh_power--;
- fi->fib_power--;
- res->nh_sel = nhsel;
- spin_unlock_bh(&fib_multipath_lock);
- return;
- }
- }
+ res->nh_sel = nhsel;
+ return;
} endfor_nexthops(fi);
/* Race condition: route has just become dead. */
res->nh_sel = 0;
- spin_unlock_bh(&fib_multipath_lock);
}
#endif
+
+void fib_select_path(struct net *net, struct fib_result *res,
+ struct flowi4 *fl4, int mp_hash)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+ if (mp_hash < 0)
+ mp_hash = get_hash_from_flowi4(fl4) >> 1;
+
+ fib_select_multipath(res, mp_hash);
+ }
+ else
+#endif
+ if (!res->prefixlen &&
+ res->table->tb_num_default > 1 &&
+ res->type == RTN_UNICAST && !fl4->flowi4_oif)
+ fib_select_default(fl4, res);
+
+ if (!fl4->saddr)
+ fl4->saddr = FIB_RES_PREFSRC(net, *res);
+}
+EXPORT_SYMBOL_GPL(fib_select_path);
diff --git a/kernel/net/ipv4/fib_trie.c b/kernel/net/ipv4/fib_trie.c
index 0ca933db1..744e5936c 100644
--- a/kernel/net/ipv4/fib_trie.c
+++ b/kernel/net/ipv4/fib_trie.c
@@ -72,6 +72,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -80,6 +81,7 @@
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/switchdev.h>
+#include <trace/events/fib.h>
#include "fib_lookup.h"
#define MAX_STAT_DEPTH 32
@@ -324,13 +326,15 @@ static inline void empty_child_dec(struct key_vector *n)
static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
- struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
- struct key_vector *l = kv->kv;
+ struct key_vector *l;
+ struct tnode *kv;
+ kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (!kv)
return NULL;
/* initialize key vector */
+ l = kv->kv;
l->key = key;
l->pos = 0;
l->bits = 0;
@@ -345,24 +349,26 @@ static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
- struct tnode *tnode = tnode_alloc(bits);
unsigned int shift = pos + bits;
- struct key_vector *tn = tnode->kv;
+ struct key_vector *tn;
+ struct tnode *tnode;
/* verify bits and pos their msb bits clear and values are valid */
BUG_ON(!bits || (shift > KEYLENGTH));
- pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
- sizeof(struct key_vector *) << bits);
-
+ tnode = tnode_alloc(bits);
if (!tnode)
return NULL;
+ pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
+ sizeof(struct key_vector *) << bits);
+
if (bits == KEYLENGTH)
tnode->full_children = 1;
else
tnode->empty_children = 1ul << bits;
+ tn = tnode->kv;
tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
tn->pos = pos;
tn->bits = bits;
@@ -1077,6 +1083,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
+ unsigned int nlflags = 0;
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
@@ -1165,14 +1172,15 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_state = state & ~FA_S_ACCESSED;
new_fa->fa_slen = fa->fa_slen;
new_fa->tb_id = tb->tb_id;
+ new_fa->fa_default = -1;
- err = netdev_switch_fib_ipv4_add(key, plen, fi,
- new_fa->fa_tos,
- cfg->fc_type,
- cfg->fc_nlflags,
- tb->tb_id);
+ err = switchdev_fib_ipv4_add(key, plen, fi,
+ new_fa->fa_tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
if (err) {
- netdev_switch_fib_ipv4_abort(fi);
+ switchdev_fib_ipv4_abort(fi);
kmem_cache_free(fn_alias_kmem, new_fa);
goto out;
}
@@ -1196,7 +1204,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (fa_match)
goto out;
- if (!(cfg->fc_nlflags & NLM_F_APPEND))
+ if (cfg->fc_nlflags & NLM_F_APPEND)
+ nlflags = NLM_F_APPEND;
+ else
fa = fa_first;
}
err = -ENOENT;
@@ -1214,14 +1224,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_state = 0;
new_fa->fa_slen = slen;
new_fa->tb_id = tb->tb_id;
+ new_fa->fa_default = -1;
/* (Optionally) offload fib entry to switch hardware. */
- err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
- cfg->fc_type,
- cfg->fc_nlflags,
- tb->tb_id);
+ err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
+ cfg->fc_nlflags, tb->tb_id);
if (err) {
- netdev_switch_fib_ipv4_abort(fi);
+ switchdev_fib_ipv4_abort(fi);
goto out_free_new_fa;
}
@@ -1235,12 +1244,12 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
- &cfg->fc_nlinfo, 0);
+ &cfg->fc_nlinfo, nlflags);
succeeded:
return 0;
out_sw_fib_del:
- netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
+ switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1270,6 +1279,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
unsigned long index;
t_key cindex;
+ trace_fib_table_lookup(tb->tb_id, flp);
+
pn = t->kv;
cindex = 0;
@@ -1406,11 +1417,20 @@ found:
continue;
for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
const struct fib_nh *nh = &fi->fib_nh[nhsel];
+ struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev);
if (nh->nh_flags & RTNH_F_DEAD)
continue;
- if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nh->nh_flags & RTNH_F_LINKDOWN &&
+ !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
continue;
+ if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
+ if (flp->flowi4_oif &&
+ flp->flowi4_oif != nh->nh_oif)
+ continue;
+ }
if (!(fib_flags & FIB_LOOKUP_NOREF))
atomic_inc(&fi->fib_clntref);
@@ -1425,6 +1445,8 @@ found:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
+ trace_fib_table_lookup_nh(nh);
+
return err;
}
}
@@ -1518,8 +1540,8 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
- cfg->fc_type, tb->tb_id);
+ switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
+ cfg->fc_type, tb->tb_id);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1547,7 +1569,7 @@ static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
do {
/* record parent and next child index */
pn = n;
- cindex = key ? get_index(key, pn) : 0;
+ cindex = (key > pn->key) ? get_index(key, pn) : 0;
if (cindex >> pn->bits)
break;
@@ -1768,10 +1790,9 @@ void fib_table_flush_external(struct fib_table *tb)
if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
continue;
- netdev_switch_fib_ipv4_del(n->key,
- KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos,
- fa->fa_type, tb->tb_id);
+ switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id);
}
/* update leaf slen */
@@ -1834,10 +1855,9 @@ int fib_table_flush(struct fib_table *tb)
continue;
}
- netdev_switch_fib_ipv4_del(n->key,
- KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos,
- fa->fa_type, tb->tb_id);
+ switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -1976,7 +1996,6 @@ struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
return NULL;
tb->tb_id = id;
- tb->tb_default = -1;
tb->tb_num_default = 0;
tb->tb_data = (alias ? alias->__data : tb->__data);
@@ -2053,11 +2072,12 @@ static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
struct trie *t)
{
- struct key_vector *n, *pn = t->kv;
+ struct key_vector *n, *pn;
if (!t)
return NULL;
+ pn = t->kv;
n = rcu_dereference(pn->tnode[0]);
if (!n)
return NULL;
diff --git a/kernel/net/ipv4/fou.c b/kernel/net/ipv4/fou.c
index 34968cd5c..bd903fe0f 100644
--- a/kernel/net/ipv4/fou.c
+++ b/kernel/net/ipv4/fou.c
@@ -24,6 +24,7 @@ struct fou {
u16 type;
struct udp_offload udp_offloads;
struct list_head list;
+ struct rcu_head rcu;
};
#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
@@ -79,7 +80,11 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
__be16 *pd = data;
size_t start = ntohs(pd[0]);
size_t offset = ntohs(pd[1]);
- size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+ size_t plen = sizeof(struct udphdr) + hdrlen +
+ max_t(size_t, offset + sizeof(u16), start);
+
+ if (skb->remcsum_offload)
+ return guehdr;
if (!pskb_may_pull(skb, plen))
return NULL;
@@ -221,29 +226,21 @@ out_unlock:
static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
struct guehdr *guehdr, void *data,
- size_t hdrlen, u8 ipproto,
- struct gro_remcsum *grc, bool nopartial)
+ size_t hdrlen, struct gro_remcsum *grc,
+ bool nopartial)
{
__be16 *pd = data;
size_t start = ntohs(pd[0]);
size_t offset = ntohs(pd[1]);
- size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
if (skb->remcsum_offload)
- return NULL;
+ return guehdr;
if (!NAPI_GRO_CB(skb)->csum_valid)
return NULL;
- /* Pull checksum that will be written */
- if (skb_gro_header_hard(skb, off + plen)) {
- guehdr = skb_gro_header_slow(skb, off + plen, off);
- if (!guehdr)
- return NULL;
- }
-
- skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen,
- start, offset, grc, nopartial);
+ guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen,
+ start, offset, grc, nopartial);
skb->remcsum_offload = 1;
@@ -307,10 +304,10 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
if (flags & GUE_PFLAG_REMCSUM) {
guehdr = gue_gro_remcsum(skb, off, guehdr,
- data + doffset, hdrlen,
- guehdr->proto_ctype, &grc,
+ data + doffset, hdrlen, &grc,
!!(fou->flags &
FOU_F_REMCSUM_NOPARTIAL));
+
if (!guehdr)
goto out;
@@ -351,7 +348,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[guehdr->proto_ctype]);
- if (WARN_ON(!ops || !ops->callbacks.gro_receive))
+ if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive))
goto out_unlock;
pp = ops->callbacks.gro_receive(head, skb);
@@ -421,7 +418,7 @@ static void fou_release(struct fou *fou)
list_del(&fou->list);
udp_tunnel_sock_release(sock);
- kfree(fou);
+ kfree_rcu(fou, rcu);
}
static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
@@ -570,7 +567,7 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_AF]) {
u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
- if (family != AF_INET && family != AF_INET6)
+ if (family != AF_INET)
return -EINVAL;
cfg->udp_config.family = family;
diff --git a/kernel/net/ipv4/geneve.c b/kernel/net/ipv4/geneve.c
deleted file mode 100644
index 8986e63f3..000000000
--- a/kernel/net/ipv4/geneve.c
+++ /dev/null
@@ -1,453 +0,0 @@
-/*
- * Geneve: Generic Network Virtualization Encapsulation
- *
- * Copyright (c) 2014 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/skbuff.h>
-#include <linux/list.h>
-#include <linux/netdevice.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <linux/igmp.h>
-#include <linux/etherdevice.h>
-#include <linux/if_ether.h>
-#include <linux/if_vlan.h>
-#include <linux/ethtool.h>
-#include <linux/mutex.h>
-#include <net/arp.h>
-#include <net/ndisc.h>
-#include <net/ip.h>
-#include <net/ip_tunnels.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/rtnetlink.h>
-#include <net/route.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-#include <net/geneve.h>
-#include <net/protocol.h>
-#include <net/udp_tunnel.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <net/ipv6.h>
-#include <net/addrconf.h>
-#include <net/ip6_tunnel.h>
-#include <net/ip6_checksum.h>
-#endif
-
-/* Protects sock_list and refcounts. */
-static DEFINE_MUTEX(geneve_mutex);
-
-/* per-network namespace private data for this module */
-struct geneve_net {
- struct list_head sock_list;
-};
-
-static int geneve_net_id;
-
-static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
-{
- return (struct genevehdr *)(udp_hdr(skb) + 1);
-}
-
-static struct geneve_sock *geneve_find_sock(struct net *net,
- sa_family_t family, __be16 port)
-{
- struct geneve_net *gn = net_generic(net, geneve_net_id);
- struct geneve_sock *gs;
-
- list_for_each_entry(gs, &gn->sock_list, list) {
- if (inet_sk(gs->sock->sk)->inet_sport == port &&
- inet_sk(gs->sock->sk)->sk.sk_family == family)
- return gs;
- }
-
- return NULL;
-}
-
-static void geneve_build_header(struct genevehdr *geneveh,
- __be16 tun_flags, u8 vni[3],
- u8 options_len, u8 *options)
-{
- geneveh->ver = GENEVE_VER;
- geneveh->opt_len = options_len / 4;
- geneveh->oam = !!(tun_flags & TUNNEL_OAM);
- geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
- geneveh->rsvd1 = 0;
- memcpy(geneveh->vni, vni, 3);
- geneveh->proto_type = htons(ETH_P_TEB);
- geneveh->rsvd2 = 0;
-
- memcpy(geneveh->options, options, options_len);
-}
-
-/* Transmit a fully formatted Geneve frame.
- *
- * When calling this function. The skb->data should point
- * to the geneve header which is fully formed.
- *
- * This function will add other UDP tunnel headers.
- */
-int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
- struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
- __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
- __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
- bool csum, bool xnet)
-{
- struct genevehdr *gnvh;
- int min_headroom;
- int err;
-
- min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
- + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
- + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
-
- err = skb_cow_head(skb, min_headroom);
- if (unlikely(err)) {
- kfree_skb(skb);
- return err;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (unlikely(!skb))
- return -ENOMEM;
-
- skb = udp_tunnel_handle_offloads(skb, csum);
- if (IS_ERR(skb))
- return PTR_ERR(skb);
-
- gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
- geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
-
- skb_set_inner_protocol(skb, htons(ETH_P_TEB));
-
- return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst,
- tos, ttl, df, src_port, dst_port, xnet,
- !csum);
-}
-EXPORT_SYMBOL_GPL(geneve_xmit_skb);
-
-static int geneve_hlen(struct genevehdr *gh)
-{
- return sizeof(*gh) + gh->opt_len * 4;
-}
-
-static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
- struct sk_buff *skb,
- struct udp_offload *uoff)
-{
- struct sk_buff *p, **pp = NULL;
- struct genevehdr *gh, *gh2;
- unsigned int hlen, gh_len, off_gnv;
- const struct packet_offload *ptype;
- __be16 type;
- int flush = 1;
-
- off_gnv = skb_gro_offset(skb);
- hlen = off_gnv + sizeof(*gh);
- gh = skb_gro_header_fast(skb, off_gnv);
- if (skb_gro_header_hard(skb, hlen)) {
- gh = skb_gro_header_slow(skb, hlen, off_gnv);
- if (unlikely(!gh))
- goto out;
- }
-
- if (gh->ver != GENEVE_VER || gh->oam)
- goto out;
- gh_len = geneve_hlen(gh);
-
- hlen = off_gnv + gh_len;
- if (skb_gro_header_hard(skb, hlen)) {
- gh = skb_gro_header_slow(skb, hlen, off_gnv);
- if (unlikely(!gh))
- goto out;
- }
-
- flush = 0;
-
- for (p = *head; p; p = p->next) {
- if (!NAPI_GRO_CB(p)->same_flow)
- continue;
-
- gh2 = (struct genevehdr *)(p->data + off_gnv);
- if (gh->opt_len != gh2->opt_len ||
- memcmp(gh, gh2, gh_len)) {
- NAPI_GRO_CB(p)->same_flow = 0;
- continue;
- }
- }
-
- type = gh->proto_type;
-
- rcu_read_lock();
- ptype = gro_find_receive_by_type(type);
- if (!ptype) {
- flush = 1;
- goto out_unlock;
- }
-
- skb_gro_pull(skb, gh_len);
- skb_gro_postpull_rcsum(skb, gh, gh_len);
- pp = ptype->callbacks.gro_receive(head, skb);
-
-out_unlock:
- rcu_read_unlock();
-out:
- NAPI_GRO_CB(skb)->flush |= flush;
-
- return pp;
-}
-
-static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
- struct udp_offload *uoff)
-{
- struct genevehdr *gh;
- struct packet_offload *ptype;
- __be16 type;
- int gh_len;
- int err = -ENOSYS;
-
- udp_tunnel_gro_complete(skb, nhoff);
-
- gh = (struct genevehdr *)(skb->data + nhoff);
- gh_len = geneve_hlen(gh);
- type = gh->proto_type;
-
- rcu_read_lock();
- ptype = gro_find_complete_by_type(type);
- if (ptype)
- err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
-
- rcu_read_unlock();
- return err;
-}
-
-static void geneve_notify_add_rx_port(struct geneve_sock *gs)
-{
- struct sock *sk = gs->sock->sk;
- sa_family_t sa_family = sk->sk_family;
- int err;
-
- if (sa_family == AF_INET) {
- err = udp_add_offload(&gs->udp_offloads);
- if (err)
- pr_warn("geneve: udp_add_offload failed with status %d\n",
- err);
- }
-}
-
-static void geneve_notify_del_rx_port(struct geneve_sock *gs)
-{
- struct sock *sk = gs->sock->sk;
- sa_family_t sa_family = sk->sk_family;
-
- if (sa_family == AF_INET)
- udp_del_offload(&gs->udp_offloads);
-}
-
-/* Callback from net/ipv4/udp.c to receive packets */
-static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct genevehdr *geneveh;
- struct geneve_sock *gs;
- int opts_len;
-
- /* Need Geneve and inner Ethernet header to be present */
- if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
- goto error;
-
- /* Return packets with reserved bits set */
- geneveh = geneve_hdr(skb);
-
- if (unlikely(geneveh->ver != GENEVE_VER))
- goto error;
-
- if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
- goto error;
-
- opts_len = geneveh->opt_len * 4;
- if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
- htons(ETH_P_TEB)))
- goto drop;
-
- gs = rcu_dereference_sk_user_data(sk);
- if (!gs)
- goto drop;
-
- gs->rcv(gs, skb);
- return 0;
-
-drop:
- /* Consume bad packet */
- kfree_skb(skb);
- return 0;
-
-error:
- /* Let the UDP layer deal with the skb */
- return 1;
-}
-
-static struct socket *geneve_create_sock(struct net *net, bool ipv6,
- __be16 port)
-{
- struct socket *sock;
- struct udp_port_cfg udp_conf;
- int err;
-
- memset(&udp_conf, 0, sizeof(udp_conf));
-
- if (ipv6) {
- udp_conf.family = AF_INET6;
- } else {
- udp_conf.family = AF_INET;
- udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
- }
-
- udp_conf.local_udp_port = port;
-
- /* Open UDP socket */
- err = udp_sock_create(net, &udp_conf, &sock);
- if (err < 0)
- return ERR_PTR(err);
-
- return sock;
-}
-
-/* Create new listen socket if needed */
-static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
- geneve_rcv_t *rcv, void *data,
- bool ipv6)
-{
- struct geneve_net *gn = net_generic(net, geneve_net_id);
- struct geneve_sock *gs;
- struct socket *sock;
- struct udp_tunnel_sock_cfg tunnel_cfg;
-
- gs = kzalloc(sizeof(*gs), GFP_KERNEL);
- if (!gs)
- return ERR_PTR(-ENOMEM);
-
- sock = geneve_create_sock(net, ipv6, port);
- if (IS_ERR(sock)) {
- kfree(gs);
- return ERR_CAST(sock);
- }
-
- gs->sock = sock;
- gs->refcnt = 1;
- gs->rcv = rcv;
- gs->rcv_data = data;
-
- /* Initialize the geneve udp offloads structure */
- gs->udp_offloads.port = port;
- gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive;
- gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
- geneve_notify_add_rx_port(gs);
-
- /* Mark socket as an encapsulation socket */
- tunnel_cfg.sk_user_data = gs;
- tunnel_cfg.encap_type = 1;
- tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
- tunnel_cfg.encap_destroy = NULL;
- setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
-
- list_add(&gs->list, &gn->sock_list);
-
- return gs;
-}
-
-struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
- geneve_rcv_t *rcv, void *data,
- bool no_share, bool ipv6)
-{
- struct geneve_sock *gs;
-
- mutex_lock(&geneve_mutex);
-
- gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
- if (gs) {
- if (!no_share && gs->rcv == rcv)
- gs->refcnt++;
- else
- gs = ERR_PTR(-EBUSY);
- } else {
- gs = geneve_socket_create(net, port, rcv, data, ipv6);
- }
-
- mutex_unlock(&geneve_mutex);
-
- return gs;
-}
-EXPORT_SYMBOL_GPL(geneve_sock_add);
-
-void geneve_sock_release(struct geneve_sock *gs)
-{
- mutex_lock(&geneve_mutex);
-
- if (--gs->refcnt)
- goto unlock;
-
- list_del(&gs->list);
- geneve_notify_del_rx_port(gs);
- udp_tunnel_sock_release(gs->sock);
- kfree_rcu(gs, rcu);
-
-unlock:
- mutex_unlock(&geneve_mutex);
-}
-EXPORT_SYMBOL_GPL(geneve_sock_release);
-
-static __net_init int geneve_init_net(struct net *net)
-{
- struct geneve_net *gn = net_generic(net, geneve_net_id);
-
- INIT_LIST_HEAD(&gn->sock_list);
-
- return 0;
-}
-
-static struct pernet_operations geneve_net_ops = {
- .init = geneve_init_net,
- .id = &geneve_net_id,
- .size = sizeof(struct geneve_net),
-};
-
-static int __init geneve_init_module(void)
-{
- int rc;
-
- rc = register_pernet_subsys(&geneve_net_ops);
- if (rc)
- return rc;
-
- pr_info("Geneve driver\n");
-
- return 0;
-}
-module_init(geneve_init_module);
-
-static void __exit geneve_cleanup_module(void)
-{
- unregister_pernet_subsys(&geneve_net_ops);
-}
-module_exit(geneve_cleanup_module);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
-MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic");
-MODULE_ALIAS_RTNL_LINK("geneve");
diff --git a/kernel/net/ipv4/gre_demux.c b/kernel/net/ipv4/gre_demux.c
index 4a7b5b2a1..d9c552a72 100644
--- a/kernel/net/ipv4/gre_demux.c
+++ b/kernel/net/ipv4/gre_demux.c
@@ -31,7 +31,6 @@
#include <net/xfrm.h>
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
-static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
@@ -61,197 +60,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
}
EXPORT_SYMBOL_GPL(gre_del_protocol);
-void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
- int hdr_len)
-{
- struct gre_base_hdr *greh;
-
- skb_push(skb, hdr_len);
-
- skb_reset_transport_header(skb);
- greh = (struct gre_base_hdr *)skb->data;
- greh->flags = tnl_flags_to_gre_flags(tpi->flags);
- greh->protocol = tpi->proto;
-
- if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
- __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
-
- if (tpi->flags&TUNNEL_SEQ) {
- *ptr = tpi->seq;
- ptr--;
- }
- if (tpi->flags&TUNNEL_KEY) {
- *ptr = tpi->key;
- ptr--;
- }
- if (tpi->flags&TUNNEL_CSUM &&
- !(skb_shinfo(skb)->gso_type &
- (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
- *ptr = 0;
- *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
- skb->len, 0));
- }
- }
-}
-EXPORT_SYMBOL_GPL(gre_build_header);
-
-static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
- bool *csum_err)
-{
- const struct gre_base_hdr *greh;
- __be32 *options;
- int hdr_len;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
- return -EINVAL;
-
- tpi->flags = gre_flags_to_tnl_flags(greh->flags);
- hdr_len = ip_gre_calc_hlen(tpi->flags);
-
- if (!pskb_may_pull(skb, hdr_len))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- tpi->proto = greh->protocol;
-
- options = (__be32 *)(greh + 1);
- if (greh->flags & GRE_CSUM) {
- if (skb_checksum_simple_validate(skb)) {
- *csum_err = true;
- return -EINVAL;
- }
-
- skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
- null_compute_pseudo);
-
- options++;
- }
-
- if (greh->flags & GRE_KEY) {
- tpi->key = *options;
- options++;
- } else
- tpi->key = 0;
-
- if (unlikely(greh->flags & GRE_SEQ)) {
- tpi->seq = *options;
- options++;
- } else
- tpi->seq = 0;
-
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IP
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
- tpi->proto = htons(ETH_P_IP);
- if ((*(u8 *)options & 0xF0) != 0x40) {
- hdr_len += 4;
- if (!pskb_may_pull(skb, hdr_len))
- return -EINVAL;
- }
- }
-
- return iptunnel_pull_header(skb, hdr_len, tpi->proto);
-}
-
-static int gre_cisco_rcv(struct sk_buff *skb)
-{
- struct tnl_ptk_info tpi;
- int i;
- bool csum_err = false;
-
-#ifdef CONFIG_NET_IPGRE_BROADCAST
- if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
- /* Looped back packet, drop it! */
- if (rt_is_output_route(skb_rtable(skb)))
- goto drop;
- }
-#endif
-
- if (parse_gre_header(skb, &tpi, &csum_err) < 0)
- goto drop;
-
- rcu_read_lock();
- for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
- struct gre_cisco_protocol *proto;
- int ret;
-
- proto = rcu_dereference(gre_cisco_proto_list[i]);
- if (!proto)
- continue;
- ret = proto->handler(skb, &tpi);
- if (ret == PACKET_RCVD) {
- rcu_read_unlock();
- return 0;
- }
- }
- rcu_read_unlock();
-
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-drop:
- kfree_skb(skb);
- return 0;
-}
-
-static void gre_cisco_err(struct sk_buff *skb, u32 info)
-{
- /* All the routers (except for Linux) return only
- * 8 bytes of packet payload. It means, that precise relaying of
- * ICMP in the real Internet is absolutely infeasible.
- *
- * Moreover, Cisco "wise men" put GRE key to the third word
- * in GRE header. It makes impossible maintaining even soft
- * state for keyed
- * GRE tunnels with enabled checksum. Tell them "thank you".
- *
- * Well, I wonder, rfc1812 was written by Cisco employee,
- * what the hell these idiots break standards established
- * by themselves???
- */
-
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
- struct tnl_ptk_info tpi;
- bool csum_err = false;
- int i;
-
- if (parse_gre_header(skb, &tpi, &csum_err)) {
- if (!csum_err) /* ignore csum errors. */
- return;
- }
-
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- skb->dev->ifindex, 0, IPPROTO_GRE, 0);
- return;
- }
- if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
- IPPROTO_GRE, 0);
- return;
- }
-
- rcu_read_lock();
- for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
- struct gre_cisco_protocol *proto;
-
- proto = rcu_dereference(gre_cisco_proto_list[i]);
- if (!proto)
- continue;
-
- if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
- goto out;
-
- }
-out:
- rcu_read_unlock();
-}
-
static int gre_rcv(struct sk_buff *skb)
{
const struct gre_protocol *proto;
@@ -302,60 +110,19 @@ static const struct net_protocol net_gre_protocol = {
.netns_ok = 1,
};
-static const struct gre_protocol ipgre_protocol = {
- .handler = gre_cisco_rcv,
- .err_handler = gre_cisco_err,
-};
-
-int gre_cisco_register(struct gre_cisco_protocol *newp)
-{
- struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
- &gre_cisco_proto_list[newp->priority];
-
- return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
-}
-EXPORT_SYMBOL_GPL(gre_cisco_register);
-
-int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
-{
- struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
- &gre_cisco_proto_list[del_proto->priority];
- int ret;
-
- ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
-
- if (ret)
- return ret;
-
- synchronize_net();
- return 0;
-}
-EXPORT_SYMBOL_GPL(gre_cisco_unregister);
-
static int __init gre_init(void)
{
pr_info("GRE over IPv4 demultiplexor driver\n");
if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
pr_err("can't add protocol\n");
- goto err;
- }
-
- if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
- pr_info("%s: can't add ipgre handler\n", __func__);
- goto err_gre;
+ return -EAGAIN;
}
-
return 0;
-err_gre:
- inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
-err:
- return -EAGAIN;
}
static void __exit gre_exit(void)
{
- gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
diff --git a/kernel/net/ipv4/gre_offload.c b/kernel/net/ipv4/gre_offload.c
index 5aa46d4b4..5a8ee3282 100644
--- a/kernel/net/ipv4/gre_offload.c
+++ b/kernel/net/ipv4/gre_offload.c
@@ -36,7 +36,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
SKB_GSO_TCP_ECN |
SKB_GSO_GRE |
SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP)))
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT)))
goto out;
if (!skb->encapsulation)
diff --git a/kernel/net/ipv4/icmp.c b/kernel/net/ipv4/icmp.c
index be5fd9b81..74314d95d 100644
--- a/kernel/net/ipv4/icmp.c
+++ b/kernel/net/ipv4/icmp.c
@@ -97,6 +97,7 @@
#include <net/xfrm.h>
#include <net/inet_common.h>
#include <net/ip_fib.h>
+#include <net/l3mdev.h>
/*
* Build xmit assembly blocks
@@ -309,9 +310,10 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
rc = false;
if (icmp_global_allow()) {
+ int vif = l3mdev_master_ifindex(dst->dev);
struct inet_peer *peer;
- peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
rc = inet_peer_xrlim_allow(peer,
net->ipv4.sysctl_icmp_ratelimit);
if (peer)
@@ -426,6 +428,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.flowi4_mark = mark;
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
+ fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
@@ -438,6 +441,22 @@ out_unlock:
icmp_xmit_unlock(sk);
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* Source and destination is swapped. See ip_multipath_icmp_hash */
+static int icmp_multipath_hash_skb(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ return fib_multipath_hash(iph->daddr, iph->saddr);
+}
+
+#else
+
+#define icmp_multipath_hash_skb(skb) (-1)
+
+#endif
+
static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
@@ -459,8 +478,11 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
+ fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev);
+
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
- rt = __ip_route_output_key(net, fl4);
+ rt = __ip_route_output_key_hash(net, fl4,
+ icmp_multipath_hash_skb(skb_in));
if (IS_ERR(rt))
return rt;
@@ -481,7 +503,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
if (err)
goto relookup_failed;
- if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
+ if (inet_addr_type_dev_table(net, skb_in->dev,
+ fl4_dec.saddr) == RTN_LOCAL) {
rt2 = __ip_route_output_key(net, &fl4_dec);
if (IS_ERR(rt2))
err = PTR_ERR(rt2);
@@ -497,6 +520,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
}
/* Ugh! */
orefdst = skb_in->_skb_refdst; /* save old refdst */
+ skb_dst_set(skb_in, NULL);
err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
RT_TOS(tos), rt2->dst.dev);
@@ -829,7 +853,7 @@ static bool icmp_unreach(struct sk_buff *skb)
*/
if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
- inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
+ inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) {
net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
&ip_hdr(skb)->saddr,
icmph->type, icmph->code,
diff --git a/kernel/net/ipv4/igmp.c b/kernel/net/ipv4/igmp.c
index a3a697f5f..05e4cba14 100644
--- a/kernel/net/ipv4/igmp.c
+++ b/kernel/net/ipv4/igmp.c
@@ -110,6 +110,9 @@
#define IP_MAX_MEMBERSHIPS 20
#define IP_MAX_MSF 10
+/* IGMP reports for link-local multicast groups are enabled by default */
+int sysctl_igmp_llm_reports __read_mostly = 1;
+
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
@@ -394,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
- return ip_local_out(skb);
+ return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
@@ -437,6 +440,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
if (pmc->multiaddr == IGMP_ALL_HOSTS)
return skb;
+ if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports)
+ return skb;
isquery = type == IGMPV3_MODE_IS_INCLUDE ||
type == IGMPV3_MODE_IS_EXCLUDE;
@@ -545,6 +550,9 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
for_each_pmc_rcu(in_dev, pmc) {
if (pmc->multiaddr == IGMP_ALL_HOSTS)
continue;
+ if (ipv4_is_local_multicast(pmc->multiaddr) &&
+ !sysctl_igmp_llm_reports)
+ continue;
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
type = IGMPV3_MODE_IS_EXCLUDE;
@@ -678,7 +686,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
return igmpv3_send_report(in_dev, pmc);
- else if (type == IGMP_HOST_LEAVE_MESSAGE)
+
+ if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+ return 0;
+
+ if (type == IGMP_HOST_LEAVE_MESSAGE)
dst = IGMP_ALL_ROUTER;
else
dst = group;
@@ -727,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
ih->group = group;
ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
static void igmp_gq_timer_expire(unsigned long data)
@@ -851,6 +863,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
if (group == IGMP_ALL_HOSTS)
return false;
+ if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+ return false;
rcu_read_lock();
for_each_pmc_rcu(in_dev, im) {
@@ -957,6 +971,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
continue;
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
+ if (ipv4_is_local_multicast(im->multiaddr) &&
+ !sysctl_igmp_llm_reports)
+ continue;
spin_lock_bh(&im->lock);
if (im->tm_running)
im->gsquery = im->gsquery && mark;
@@ -1181,6 +1198,8 @@ static void igmp_group_dropped(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
+ if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+ return;
reporter = im->reporter;
igmp_stop_timer(im);
@@ -1213,6 +1232,8 @@ static void igmp_group_added(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
+ if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+ return;
if (in_dev->dead)
return;
@@ -1339,6 +1360,171 @@ out:
}
EXPORT_SYMBOL(ip_mc_inc_group);
+static int ip_mc_check_iphdr(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ unsigned int len;
+ unsigned int offset = skb_network_offset(skb) + sizeof(*iph);
+
+ if (!pskb_may_pull(skb, offset))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+
+ if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph))
+ return -EINVAL;
+
+ offset += ip_hdrlen(skb) - sizeof(*iph);
+
+ if (!pskb_may_pull(skb, offset))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ return -EINVAL;
+
+ len = skb_network_offset(skb) + ntohs(iph->tot_len);
+ if (skb->len < len || len < offset)
+ return -EINVAL;
+
+ skb_set_transport_header(skb, offset);
+
+ return 0;
+}
+
+static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb);
+
+ len += sizeof(struct igmpv3_report);
+
+ return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+}
+
+static int ip_mc_check_igmp_query(struct sk_buff *skb)
+{
+ unsigned int len = skb_transport_offset(skb);
+
+ len += sizeof(struct igmphdr);
+ if (skb->len < len)
+ return -EINVAL;
+
+ /* IGMPv{1,2}? */
+ if (skb->len != len) {
+ /* or IGMPv3? */
+ len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr);
+ if (skb->len < len || !pskb_may_pull(skb, len))
+ return -EINVAL;
+ }
+
+ /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
+ * all-systems destination addresses (224.0.0.1) for general queries
+ */
+ if (!igmp_hdr(skb)->group &&
+ ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ip_mc_check_igmp_msg(struct sk_buff *skb)
+{
+ switch (igmp_hdr(skb)->type) {
+ case IGMP_HOST_LEAVE_MESSAGE:
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ /* fall through */
+ return 0;
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ return ip_mc_check_igmp_reportv3(skb);
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ return ip_mc_check_igmp_query(skb);
+ default:
+ return -ENOMSG;
+ }
+}
+
+static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
+{
+ return skb_checksum_simple_validate(skb);
+}
+
+static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+
+{
+ struct sk_buff *skb_chk;
+ unsigned int transport_len;
+ unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
+ int ret = -EINVAL;
+
+ transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb);
+
+ skb_chk = skb_checksum_trimmed(skb, transport_len,
+ ip_mc_validate_checksum);
+ if (!skb_chk)
+ goto err;
+
+ if (!pskb_may_pull(skb_chk, len))
+ goto err;
+
+ ret = ip_mc_check_igmp_msg(skb_chk);
+ if (ret)
+ goto err;
+
+ if (skb_trimmed)
+ *skb_trimmed = skb_chk;
+ /* free now unneeded clone */
+ else if (skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ ret = 0;
+
+err:
+ if (ret && skb_chk && skb_chk != skb)
+ kfree_skb(skb_chk);
+
+ return ret;
+}
+
+/**
+ * ip_mc_check_igmp - checks whether this is a sane IGMP packet
+ * @skb: the skb to validate
+ * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional)
+ *
+ * Checks whether an IPv4 packet is a valid IGMP packet. If so sets
+ * skb transport header accordingly and returns zero.
+ *
+ * -EINVAL: A broken packet was detected, i.e. it violates some internet
+ * standard
+ * -ENOMSG: IP header validation succeeded but it is not an IGMP packet.
+ * -ENOMEM: A memory allocation failure happened.
+ *
+ * Optionally, an skb pointer might be provided via skb_trimmed (or set it
+ * to NULL): After parsing an IGMP packet successfully it will point to
+ * an skb which has its tail aligned to the IP packet end. This might
+ * either be the originally provided skb or a trimmed, cloned version if
+ * the skb frame had data beyond the IP packet. A cloned skb allows us
+ * to leave the original skb and its full frame unchanged (which might be
+ * desirable for layer 2 frame jugglers).
+ *
+ * Caller needs to set the skb network header and free any returned skb if it
+ * differs from the provided skb.
+ */
+int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+{
+ int ret = ip_mc_check_iphdr(skb);
+
+ if (ret < 0)
+ return ret;
+
+ if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
+ return -ENOMSG;
+
+ return __ip_mc_check_igmp(skb, skb_trimmed);
+}
+EXPORT_SYMBOL(ip_mc_check_igmp);
+
/*
* Resend IGMP JOIN report; used by netdev notifier.
*/
@@ -1353,6 +1539,9 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
for_each_pmc_rtnl(in_dev, im) {
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
+ if (ipv4_is_local_multicast(im->multiaddr) &&
+ !sysctl_igmp_llm_reports)
+ continue;
/* a failover is happening and switches
* must be notified immediately
@@ -1937,7 +2126,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
ASSERT_RTNL();
in_dev = ip_mc_find_dev(net, imr);
- if (!in_dev) {
+ if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) {
ret = -ENODEV;
goto out;
}
@@ -1958,7 +2147,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
*imlp = iml->next_rcu;
- ip_mc_dec_group(in_dev, group);
+ if (in_dev)
+ ip_mc_dec_group(in_dev, group);
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
@@ -2203,11 +2393,11 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
struct ip_sf_socklist *psl;
struct net *net = sock_net(sk);
+ ASSERT_RTNL();
+
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = 0;
@@ -2228,7 +2418,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
goto done;
msf->imsf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
- rtnl_unlock();
if (!psl) {
len = 0;
count = 0;
@@ -2247,7 +2436,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
return -EFAULT;
return 0;
done:
- rtnl_unlock();
return err;
}
@@ -2261,6 +2449,8 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ ASSERT_RTNL();
+
psin = (struct sockaddr_in *)&gsf->gf_group;
if (psin->sin_family != AF_INET)
return -EINVAL;
@@ -2268,8 +2458,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
err = -EADDRNOTAVAIL;
for_each_pmc_rtnl(inet, pmc) {
@@ -2281,7 +2469,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
goto done;
gsf->gf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
- rtnl_unlock();
count = psl ? psl->sl_count : 0;
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
@@ -2301,7 +2488,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
}
return 0;
done:
- rtnl_unlock();
return err;
}
@@ -2380,7 +2566,7 @@ void ip_mc_drop_socket(struct sock *sk)
}
/* called with rcu_read_lock() */
-int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto)
{
struct ip_mc_list *im;
struct ip_mc_list __rcu **mc_hash;
diff --git a/kernel/net/ipv4/inet_connection_sock.c b/kernel/net/ipv4/inet_connection_sock.c
index b27fc401c..641489148 100644
--- a/kernel/net/ipv4/inet_connection_sock.c
+++ b/kernel/net/ipv4/inet_connection_sock.c
@@ -99,6 +99,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
kuid_t uid = sock_i_uid(sk);
+ int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
local_bh_disable();
if (!snum) {
@@ -106,6 +107,14 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
again:
inet_get_local_port_range(net, &low, &high);
+ if (attempt_half) {
+ int half = low + ((high - low) >> 1);
+
+ if (attempt_half == 1)
+ high = half;
+ else
+ low = half;
+ }
remaining = (high - low) + 1;
smallest_rover = rover = prandom_u32() % remaining + low;
@@ -127,11 +136,6 @@ again:
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
smallest_rover = rover;
- if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
- !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
- snum = smallest_rover;
- goto tb_found;
- }
}
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = rover;
@@ -159,6 +163,11 @@ again:
snum = smallest_rover;
goto have_snum;
}
+ if (attempt_half == 1) {
+ /* OK we now try the upper half of the range */
+ attempt_half = 2;
+ goto again;
+ }
goto fail;
}
/* OK, here is the one we will use. HEAD is
@@ -321,14 +330,12 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
if (error)
goto out_err;
}
- req = reqsk_queue_remove(queue);
+ req = reqsk_queue_remove(queue, sk);
newsk = req->sk;
- sk_acceptq_removed(sk);
if (sk->sk_protocol == IPPROTO_TCP &&
- tcp_rsk(req)->tfo_listener &&
- queue->fastopenq) {
- spin_lock_bh(&queue->fastopenq->lock);
+ tcp_rsk(req)->tfo_listener) {
+ spin_lock_bh(&queue->fastopenq.lock);
if (tcp_rsk(req)->tfo_listener) {
/* We are still waiting for the final ACK from 3WHS
* so can't free req now. Instead, we set req->sk to
@@ -339,7 +346,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
req->sk = NULL;
req = NULL;
}
- spin_unlock_bh(&queue->fastopenq->lock);
+ spin_unlock_bh(&queue->fastopenq.lock);
}
out:
release_sock(sk);
@@ -399,7 +406,7 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
}
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
-struct dst_entry *inet_csk_route_req(struct sock *sk,
+struct dst_entry *inet_csk_route_req(const struct sock *sk,
struct flowi4 *fl4,
const struct request_sock *req)
{
@@ -430,7 +437,7 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);
-struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
+struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct sock *newsk,
const struct request_sock *req)
{
@@ -469,65 +476,12 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
- const u32 rnd, const u32 synq_hsize)
-{
- return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
-}
-
#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) true
#endif
-/* Note: this is temporary :
- * req sock will no longer be in listener hash table
-*/
-struct request_sock *inet_csk_search_req(struct sock *sk,
- const __be16 rport,
- const __be32 raddr,
- const __be32 laddr)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req;
- u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
- lopt->nr_table_entries);
-
- spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
- for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
- const struct inet_request_sock *ireq = inet_rsk(req);
-
- if (ireq->ir_rmt_port == rport &&
- ireq->ir_rmt_addr == raddr &&
- ireq->ir_loc_addr == laddr &&
- AF_INET_FAMILY(req->rsk_ops->family)) {
- atomic_inc(&req->rsk_refcnt);
- WARN_ON(req->sk);
- break;
- }
- }
- spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
-
- return req;
-}
-EXPORT_SYMBOL_GPL(inet_csk_search_req);
-
-void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
- unsigned long timeout)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
- inet_rsk(req)->ir_rmt_port,
- lopt->hash_rnd, lopt->nr_table_entries);
-
- reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
- inet_csk_reqsk_queue_added(sk, timeout);
-}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
-
/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;
@@ -554,7 +508,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
req->num_timeout >= rskq_defer_accept - 1;
}
-int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
+int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
{
int err = req->rsk_ops->rtx_syn_ack(parent, req);
@@ -564,27 +518,21 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
}
EXPORT_SYMBOL(inet_rtx_syn_ack);
-/* return true if req was found in the syn_table[] */
+/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock_queue *queue,
struct request_sock *req)
{
- struct listen_sock *lopt = queue->listen_opt;
- struct request_sock **prev;
+ struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
bool found = false;
- spin_lock(&queue->syn_wait_lock);
+ if (sk_hashed(req_to_sk(req))) {
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
- for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
- prev = &(*prev)->dl_next) {
- if (*prev == req) {
- *prev = req->dl_next;
- found = true;
- break;
- }
+ spin_lock(lock);
+ found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
+ spin_unlock(lock);
}
-
- spin_unlock(&queue->syn_wait_lock);
- if (del_timer_sync(&req->rsk_timer))
+ if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
reqsk_put(req);
return found;
}
@@ -598,21 +546,25 @@ void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
+{
+ inet_csk_reqsk_queue_drop(sk, req);
+ reqsk_put(req);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
+
static void reqsk_timer_handler(unsigned long data)
{
struct request_sock *req = (struct request_sock *)data;
struct sock *sk_listener = req->rsk_listener;
struct inet_connection_sock *icsk = inet_csk(sk_listener);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct listen_sock *lopt = queue->listen_opt;
int qlen, expire = 0, resend = 0;
int max_retries, thresh;
u8 defer_accept;
- if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
- reqsk_put(req);
- return;
- }
+ if (sk_state_load(sk_listener) != TCP_LISTEN)
+ goto drop;
max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
thresh = max_retries;
@@ -633,9 +585,9 @@ static void reqsk_timer_handler(unsigned long data)
* embrions; and abort old ones without pity, if old
* ones are about to clog our table.
*/
- qlen = listen_sock_qlen(lopt);
- if (qlen >> (lopt->max_qlen_log - 1)) {
- int young = listen_sock_young(lopt) << 1;
+ qlen = reqsk_queue_len(queue);
+ if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
+ int young = reqsk_queue_len_young(queue) << 1;
while (thresh > 2) {
if (qlen < young)
@@ -657,41 +609,40 @@ static void reqsk_timer_handler(unsigned long data)
unsigned long timeo;
if (req->num_timeout++ == 0)
- atomic_inc(&lopt->young_dec);
+ atomic_dec(&queue->young);
timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
return;
}
- inet_csk_reqsk_queue_drop(sk_listener, req);
- reqsk_put(req);
+drop:
+ inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
}
-void reqsk_queue_hash_req(struct request_sock_queue *queue,
- u32 hash, struct request_sock *req,
- unsigned long timeout)
+static void reqsk_queue_hash_req(struct request_sock *req,
+ unsigned long timeout)
{
- struct listen_sock *lopt = queue->listen_opt;
-
req->num_retrans = 0;
req->num_timeout = 0;
req->sk = NULL;
+ setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
+ mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
+
+ inet_ehash_insert(req_to_sk(req), NULL);
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
smp_wmb();
- atomic_set(&req->rsk_refcnt, 2);
- setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
- req->rsk_hash = hash;
-
- spin_lock(&queue->syn_wait_lock);
- req->dl_next = lopt->syn_table[hash];
- lopt->syn_table[hash] = req;
- spin_unlock(&queue->syn_wait_lock);
+ atomic_set(&req->rsk_refcnt, 2 + 1);
+}
- mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+ unsigned long timeout)
+{
+ reqsk_queue_hash_req(req, timeout);
+ inet_csk_reqsk_queue_added(sk);
}
-EXPORT_SYMBOL(reqsk_queue_hash_req);
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
@@ -782,16 +733,14 @@ void inet_csk_prepare_forced_close(struct sock *sk)
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
-int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+int inet_csk_listen_start(struct sock *sk, int backlog)
{
- struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+ struct inet_sock *inet = inet_sk(sk);
- if (rc != 0)
- return rc;
+ reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = 0;
+ sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
@@ -800,7 +749,7 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
* It is OK, because this socket enters to hash table only
* after validation is complete.
*/
- sk->sk_state = TCP_LISTEN;
+ sk_state_store(sk, TCP_LISTEN);
if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
inet->inet_sport = htons(inet->inet_num);
@@ -811,11 +760,76 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
}
sk->sk_state = TCP_CLOSE;
- __reqsk_queue_destroy(&icsk->icsk_accept_queue);
return -EADDRINUSE;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+static void inet_child_forget(struct sock *sk, struct request_sock *req,
+ struct sock *child)
+{
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
+ BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(sk != req->rsk_listener);
+
+ /* Paranoid, to prevent race condition if
+ * an inbound pkt destined for child is
+ * blocked by sock lock in tcp_v4_rcv().
+ * Also to satisfy an assertion in
+ * tcp_v4_destroy_sock().
+ */
+ tcp_sk(child)->fastopen_rsk = NULL;
+ }
+ inet_csk_destroy_sock(child);
+ reqsk_put(req);
+}
+
+struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
+ struct request_sock *req,
+ struct sock *child)
+{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+
+ spin_lock(&queue->rskq_lock);
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ inet_child_forget(sk, req, child);
+ child = NULL;
+ } else {
+ req->sk = child;
+ req->dl_next = NULL;
+ if (queue->rskq_accept_head == NULL)
+ queue->rskq_accept_head = req;
+ else
+ queue->rskq_accept_tail->dl_next = req;
+ queue->rskq_accept_tail = req;
+ sk_acceptq_added(sk);
+ }
+ spin_unlock(&queue->rskq_lock);
+ return child;
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
+
+struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
+ struct request_sock *req, bool own_req)
+{
+ if (own_req) {
+ inet_csk_reqsk_queue_drop(sk, req);
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ if (inet_csk_reqsk_queue_add(sk, req, child))
+ return child;
+ }
+ /* Too bad, another child took ownership of the request, undo. */
+ bh_unlock_sock(child);
+ sock_put(child);
+ return NULL;
+}
+EXPORT_SYMBOL(inet_csk_complete_hashdance);
+
/*
* This routine closes sockets which have been at least partially
* opened, but not yet accepted.
@@ -824,11 +838,7 @@ void inet_csk_listen_stop(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct request_sock *acc_req;
- struct request_sock *req;
-
- /* make all the listen_opt local to us */
- acc_req = reqsk_queue_yank_acceptq(queue);
+ struct request_sock *next, *req;
/* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close)
@@ -838,57 +848,34 @@ void inet_csk_listen_stop(struct sock *sk)
* To be honest, we are not able to make either
* of the variants now. --ANK
*/
- reqsk_queue_destroy(queue);
-
- while ((req = acc_req) != NULL) {
+ while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
struct sock *child = req->sk;
- acc_req = req->dl_next;
-
local_bh_disable();
bh_lock_sock(child);
WARN_ON(sock_owned_by_user(child));
sock_hold(child);
- sk->sk_prot->disconnect(child, O_NONBLOCK);
-
- sock_orphan(child);
-
- percpu_counter_inc(sk->sk_prot->orphan_count);
-
- if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
- BUG_ON(tcp_sk(child)->fastopen_rsk != req);
- BUG_ON(sk != req->rsk_listener);
-
- /* Paranoid, to prevent race condition if
- * an inbound pkt destined for child is
- * blocked by sock lock in tcp_v4_rcv().
- * Also to satisfy an assertion in
- * tcp_v4_destroy_sock().
- */
- tcp_sk(child)->fastopen_rsk = NULL;
- }
- inet_csk_destroy_sock(child);
-
+ inet_child_forget(sk, req, child);
bh_unlock_sock(child);
local_bh_enable();
sock_put(child);
- sk_acceptq_removed(sk);
- reqsk_put(req);
+ cond_resched();
}
- if (queue->fastopenq) {
+ if (queue->fastopenq.rskq_rst_head) {
/* Free all the reqs queued in rskq_rst_head. */
- spin_lock_bh(&queue->fastopenq->lock);
- acc_req = queue->fastopenq->rskq_rst_head;
- queue->fastopenq->rskq_rst_head = NULL;
- spin_unlock_bh(&queue->fastopenq->lock);
- while ((req = acc_req) != NULL) {
- acc_req = req->dl_next;
+ spin_lock_bh(&queue->fastopenq.lock);
+ req = queue->fastopenq.rskq_rst_head;
+ queue->fastopenq.rskq_rst_head = NULL;
+ spin_unlock_bh(&queue->fastopenq.lock);
+ while (req != NULL) {
+ next = req->dl_next;
reqsk_put(req);
+ req = next;
}
}
- WARN_ON(sk->sk_ack_backlog);
+ WARN_ON_ONCE(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/kernel/net/ipv4/inet_diag.c b/kernel/net/ipv4/inet_diag.c
index 4d32262c7..ab9f8a666 100644
--- a/kernel/net/ipv4/inet_diag.c
+++ b/kernel/net/ipv4/inet_diag.c
@@ -151,6 +151,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
if (nla_put_u8(skb, INET_DIAG_TCLASS,
inet6_sk(sk)->tclass) < 0)
goto errout;
+
+ if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+ nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
+ goto errout;
}
#endif
@@ -200,9 +204,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
}
#undef EXPIRES_IN_MS
- if (ext & (1 << (INET_DIAG_INFO - 1))) {
+ if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
attr = nla_reserve(skb, INET_DIAG_INFO,
- sizeof(struct tcp_info));
+ handler->idiag_info_size);
if (!attr)
goto errout;
@@ -726,91 +730,21 @@ static void twsk_build_assert(void)
#endif
}
-static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct inet_diag_entry entry;
- int j, s_j, reqnum, s_reqnum;
- struct listen_sock *lopt;
- int err = 0;
-
- s_j = cb->args[3];
- s_reqnum = cb->args[4];
-
- if (s_j > 0)
- s_j--;
-
- entry.family = sk->sk_family;
-
- spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-
- lopt = icsk->icsk_accept_queue.listen_opt;
- if (!lopt || !listen_sock_qlen(lopt))
- goto out;
-
- if (bc) {
- entry.sport = inet->inet_num;
- entry.userlocks = sk->sk_userlocks;
- }
-
- for (j = s_j; j < lopt->nr_table_entries; j++) {
- struct request_sock *req, *head = lopt->syn_table[j];
-
- reqnum = 0;
- for (req = head; req; reqnum++, req = req->dl_next) {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- if (reqnum < s_reqnum)
- continue;
- if (r->id.idiag_dport != ireq->ir_rmt_port &&
- r->id.idiag_dport)
- continue;
-
- if (bc) {
- /* Note: entry.sport and entry.userlocks are already set */
- entry_fill_addrs(&entry, req_to_sk(req));
- entry.dport = ntohs(ireq->ir_rmt_port);
-
- if (!inet_diag_bc_run(bc, &entry))
- continue;
- }
-
- err = inet_req_diag_fill(req_to_sk(req), skb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI, cb->nlh);
- if (err < 0) {
- cb->args[3] = j + 1;
- cb->args[4] = reqnum;
- goto out;
- }
- }
-
- s_reqnum = 0;
- }
-
-out:
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-
- return err;
-}
-
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
struct net *net = sock_net(skb->sk);
int i, num, s_i, s_num;
+ u32 idiag_states = r->idiag_states;
+ if (idiag_states & TCPF_SYN_RECV)
+ idiag_states |= TCPF_NEW_SYN_RECV;
s_i = cb->args[1];
s_num = num = cb->args[2];
if (cb->args[0] == 0) {
- if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+ if (!(idiag_states & TCPF_LISTEN))
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
@@ -840,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport)
goto next_listen;
- if (!(r->idiag_states & TCPF_LISTEN) ||
- r->id.idiag_dport ||
+ if (r->id.idiag_dport ||
cb->args[3] > 0)
- goto syn_recv;
-
- if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
- spin_unlock_bh(&ilb->lock);
- goto done;
- }
-
-syn_recv:
- if (!(r->idiag_states & TCPF_SYN_RECV))
goto next_listen;
- if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
+ if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -875,7 +799,7 @@ skip_listen_ht:
s_i = num = s_num = 0;
}
- if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+ if (!(idiag_states & ~TCPF_LISTEN))
goto out;
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
@@ -902,7 +826,7 @@ skip_listen_ht:
goto next_normal;
state = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_substate : sk->sk_state;
- if (!(r->idiag_states & (1 << state)))
+ if (!(idiag_states & (1 << state)))
goto next_normal;
if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family)
@@ -1078,14 +1002,62 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
return inet_diag_get_exact(skb, h, nlmsg_data(h));
}
+static
+int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
+{
+ const struct inet_diag_handler *handler;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ struct inet_diag_msg *r;
+ void *info = NULL;
+ int err = 0;
+
+ nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0);
+ if (!nlh)
+ return -ENOMEM;
+
+ r = nlmsg_data(nlh);
+ memset(r, 0, sizeof(*r));
+ inet_diag_msg_common_fill(r, sk);
+ if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM)
+ r->id.idiag_sport = inet_sk(sk)->inet_sport;
+ r->idiag_state = sk->sk_state;
+
+ if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+
+ handler = inet_diag_lock_handler(sk->sk_protocol);
+ if (IS_ERR(handler)) {
+ inet_diag_unlock_handler(handler);
+ nlmsg_cancel(skb, nlh);
+ return PTR_ERR(handler);
+ }
+
+ attr = handler->idiag_info_size
+ ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size)
+ : NULL;
+ if (attr)
+ info = nla_data(attr);
+
+ handler->idiag_get_info(sk, r, info);
+ inet_diag_unlock_handler(handler);
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
static const struct sock_diag_handler inet_diag_handler = {
.family = AF_INET,
.dump = inet_diag_handler_dump,
+ .get_info = inet_diag_handler_get_info,
};
static const struct sock_diag_handler inet6_diag_handler = {
.family = AF_INET6,
.dump = inet_diag_handler_dump,
+ .get_info = inet_diag_handler_get_info,
};
int inet_diag_register(const struct inet_diag_handler *h)
diff --git a/kernel/net/ipv4/inet_fragment.c b/kernel/net/ipv4/inet_fragment.c
index 5e346a082..fe144dae7 100644
--- a/kernel/net/ipv4/inet_fragment.c
+++ b/kernel/net/ipv4/inet_fragment.c
@@ -131,34 +131,22 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
unsigned int evicted = 0;
HLIST_HEAD(expired);
-evict_again:
spin_lock(&hb->chain_lock);
hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
if (!inet_fragq_should_evict(fq))
continue;
- if (!del_timer(&fq->timer)) {
- /* q expiring right now thus increment its refcount so
- * it won't be freed under us and wait until the timer
- * has finished executing then destroy it
- */
- atomic_inc(&fq->refcnt);
- spin_unlock(&hb->chain_lock);
- del_timer_sync(&fq->timer);
- inet_frag_put(fq, f);
- goto evict_again;
- }
+ if (!del_timer(&fq->timer))
+ continue;
- fq->flags |= INET_FRAG_EVICTED;
- hlist_del(&fq->list);
- hlist_add_head(&fq->list, &expired);
+ hlist_add_head(&fq->list_evictor, &expired);
++evicted;
}
spin_unlock(&hb->chain_lock);
- hlist_for_each_entry_safe(fq, n, &expired, list)
+ hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
f->frag_expire((unsigned long) fq);
return evicted;
@@ -221,12 +209,6 @@ int inet_frags_init(struct inet_frags *f)
}
EXPORT_SYMBOL(inet_frags_init);
-void inet_frags_init_net(struct netns_frags *nf)
-{
- init_frag_mem_limit(nf);
-}
-EXPORT_SYMBOL(inet_frags_init_net);
-
void inet_frags_fini(struct inet_frags *f)
{
cancel_work_sync(&f->frags_work);
@@ -240,18 +222,20 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
int i;
nf->low_thresh = 0;
- local_bh_disable();
evict_again:
+ local_bh_disable();
seq = read_seqbegin(&f->rnd_seqlock);
for (i = 0; i < INETFRAGS_HASHSZ ; i++)
inet_evict_bucket(f, &f->hash[i]);
- if (read_seqretry(&f->rnd_seqlock, seq))
- goto evict_again;
-
local_bh_enable();
+ cond_resched();
+
+ if (read_seqretry(&f->rnd_seqlock, seq) ||
+ percpu_counter_sum(&nf->mem))
+ goto evict_again;
percpu_counter_destroy(&nf->mem);
}
@@ -284,8 +268,8 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
struct inet_frag_bucket *hb;
hb = get_frag_bucket_locked(fq, f);
- if (!(fq->flags & INET_FRAG_EVICTED))
- hlist_del(&fq->list);
+ hlist_del(&fq->list);
+ fq->flags |= INET_FRAG_COMPLETE;
spin_unlock(&hb->chain_lock);
}
@@ -297,7 +281,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq_unlink(fq, f);
atomic_dec(&fq->refcnt);
- fq->flags |= INET_FRAG_COMPLETE;
}
}
EXPORT_SYMBOL(inet_frag_kill);
@@ -330,11 +313,12 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
fp = xp;
}
sum = sum_truesize + f->qsize;
- sub_frag_mem_limit(q, sum);
if (f->destructor)
f->destructor(q);
kmem_cache_free(f->frags_cachep, q);
+
+ sub_frag_mem_limit(nf, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);
@@ -390,7 +374,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
q->net = nf;
f->constructor(q, arg);
- add_frag_mem_limit(q, f->qsize);
+ add_frag_mem_limit(nf, f->qsize);
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
diff --git a/kernel/net/ipv4/inet_hashtables.c b/kernel/net/ipv4/inet_hashtables.c
index c6fb80bd5..ccc598079 100644
--- a/kernel/net/ipv4/inet_hashtables.c
+++ b/kernel/net/ipv4/inet_hashtables.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
+#include <linux/vmalloc.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
@@ -90,10 +91,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
- atomic_inc(&hashinfo->bsockets);
-
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &tb->owners);
tb->num_owners++;
@@ -111,8 +108,6 @@ static void __inet_put_port(struct sock *sk)
struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
struct inet_bind_bucket *tb;
- atomic_dec(&hashinfo->bsockets);
-
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
__sk_del_bind_node(sk);
@@ -131,7 +126,7 @@ void inet_put_port(struct sock *sk)
}
EXPORT_SYMBOL(inet_put_port);
-int __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
unsigned short port = inet_sk(child)->inet_num;
@@ -142,6 +137,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
+ if (unlikely(!tb)) {
+ spin_unlock(&head->lock);
+ return -ENOENT;
+ }
if (tb->port != port) {
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
@@ -190,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
}
return score;
}
@@ -348,7 +349,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk2;
const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw = NULL;
- int twrefcnt = 0;
spin_lock(lock);
@@ -376,21 +376,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain);
if (tw) {
- twrefcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
- if (twrefcnt)
- inet_twsk_put(tw);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
} else if (tw) {
/* Silly. Should hash-dance instead... */
- inet_twsk_deschedule(tw);
-
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
}
return 0;
@@ -399,23 +395,27 @@ not_unique:
return -EADDRNOTAVAIL;
}
-static inline u32 inet_sk_port_offset(const struct sock *sk)
+static u32 inet_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
+
return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
inet->inet_daddr,
inet->inet_dport);
}
-int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+/* insert a socket into ehash, and eventually remove another one
+ * (The another one can be a SYN_RECV or TIMEWAIT
+ */
+bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
struct inet_ehash_bucket *head;
spinlock_t *lock;
- int twrefcnt = 0;
+ bool ret = true;
- WARN_ON(!sk_unhashed(sk));
+ WARN_ON_ONCE(!sk_unhashed(sk));
sk->sk_hash = sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
@@ -423,25 +423,41 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock(lock);
- __sk_nulls_add_node_rcu(sk, list);
- if (tw) {
- WARN_ON(sk->sk_hash != tw->tw_hash);
- twrefcnt = inet_twsk_unhash(tw);
+ if (osk) {
+ WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
+ ret = sk_nulls_del_node_init_rcu(osk);
}
+ if (ret)
+ __sk_nulls_add_node_rcu(sk, list);
spin_unlock(lock);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
- return twrefcnt;
+ return ret;
}
-EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
-int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
+{
+ bool ok = inet_ehash_insert(sk, osk);
+
+ if (ok) {
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ } else {
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+ sk->sk_state = TCP_CLOSE;
+ sock_set_flag(sk, SOCK_DEAD);
+ inet_csk_destroy_sock(sk);
+ }
+ return ok;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
+
+void __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
- if (sk->sk_state != TCP_LISTEN)
- return __inet_hash_nolisten(sk, tw);
-
+ if (sk->sk_state != TCP_LISTEN) {
+ inet_ehash_nolisten(sk, osk);
+ return;
+ }
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
@@ -449,7 +465,6 @@ int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
spin_unlock(&ilb->lock);
- return 0;
}
EXPORT_SYMBOL(__inet_hash);
@@ -496,7 +511,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct inet_bind_bucket *tb;
int ret;
struct net *net = sock_net(sk);
- int twrefcnt = 1;
if (!snum) {
int i, remaining, low, high, port;
@@ -507,8 +521,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
+ /* By starting with offset being an even number,
+ * we tend to leave about 50% of ports for other uses,
+ * like bind(0).
+ */
+ offset &= ~1;
+
local_bh_disable();
- for (i = 1; i <= remaining; i++) {
+ for (i = 0; i < remaining; i++) {
port = low + (i + offset) % remaining;
if (inet_is_local_reserved_port(net, port))
continue;
@@ -552,25 +572,20 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
return -EADDRNOTAVAIL;
ok:
- hint += i;
+ hint += (i + 2) & ~1;
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
- twrefcnt += __inet_hash_nolisten(sk, tw);
+ inet_ehash_nolisten(sk, (struct sock *)tw);
}
if (tw)
- twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+ inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&head->lock);
- if (tw) {
- inet_twsk_deschedule(tw);
- while (twrefcnt) {
- twrefcnt--;
- inet_twsk_put(tw);
- }
- }
+ if (tw)
+ inet_twsk_deschedule_put(tw);
ret = 0;
goto out;
@@ -580,7 +595,7 @@ ok:
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __inet_hash_nolisten(sk, NULL);
+ inet_ehash_nolisten(sk, NULL);
spin_unlock_bh(&head->lock);
return 0;
} else {
@@ -599,7 +614,11 @@ out:
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
- return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
+ u32 port_offset = 0;
+
+ if (!inet_sk(sk)->inet_num)
+ port_offset = inet_sk_port_offset(sk);
+ return __inet_hash_connect(death_row, sk, port_offset,
__inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);
@@ -608,7 +627,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
{
int i;
- atomic_set(&h->bsockets, 0);
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
@@ -616,3 +634,32 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+
+int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
+{
+ unsigned int locksz = sizeof(spinlock_t);
+ unsigned int i, nblocks = 1;
+
+ if (locksz != 0) {
+ /* allocate 2 cache lines or at least one spinlock per cpu */
+ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
+ nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
+
+ /* no more locks than number of hash buckets */
+ nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+
+ hashinfo->ehash_locks = kmalloc_array(nblocks, locksz,
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!hashinfo->ehash_locks)
+ hashinfo->ehash_locks = vmalloc(nblocks * locksz);
+
+ if (!hashinfo->ehash_locks)
+ return -ENOMEM;
+
+ for (i = 0; i < nblocks; i++)
+ spin_lock_init(&hashinfo->ehash_locks[i]);
+ }
+ hashinfo->ehash_locks_mask = nblocks - 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
diff --git a/kernel/net/ipv4/inet_timewait_sock.c b/kernel/net/ipv4/inet_timewait_sock.c
index 00ec8d5d7..c67f9bd76 100644
--- a/kernel/net/ipv4/inet_timewait_sock.c
+++ b/kernel/net/ipv4/inet_timewait_sock.c
@@ -18,28 +18,6 @@
/**
- * inet_twsk_unhash - unhash a timewait socket from established hash
- * @tw: timewait socket
- *
- * unhash a timewait socket from established hash, if hashed.
- * ehash lock must be held by caller.
- * Returns 1 if caller should call inet_twsk_put() after lock release.
- */
-int inet_twsk_unhash(struct inet_timewait_sock *tw)
-{
- if (hlist_nulls_unhashed(&tw->tw_node))
- return 0;
-
- hlist_nulls_del_rcu(&tw->tw_node);
- sk_nulls_node_init(&tw->tw_node);
- /*
- * We cannot call inet_twsk_put() ourself under lock,
- * caller must call it for us.
- */
- return 1;
-}
-
-/**
* inet_twsk_bind_unhash - unhash a timewait socket from bind hash
* @tw: timewait socket
* @hashinfo: hashinfo pointer
@@ -48,35 +26,29 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
* bind hash lock must be held by caller.
* Returns 1 if caller should call inet_twsk_put() after lock release.
*/
-int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
{
struct inet_bind_bucket *tb = tw->tw_tb;
if (!tb)
- return 0;
+ return;
__hlist_del(&tw->tw_bind_node);
tw->tw_tb = NULL;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
- /*
- * We cannot call inet_twsk_put() ourself under lock,
- * caller must call it for us.
- */
- return 1;
+ __sock_put((struct sock *)tw);
}
/* Must be called with locally disabled BHs. */
static void inet_twsk_kill(struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
- struct inet_bind_hashbucket *bhead;
- int refcnt;
- /* Unlink from established hashes. */
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+ struct inet_bind_hashbucket *bhead;
spin_lock(lock);
- refcnt = inet_twsk_unhash(tw);
+ sk_nulls_del_node_init_rcu((struct sock *)tw);
spin_unlock(lock);
/* Disassociate with bind bucket. */
@@ -84,11 +56,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
hashinfo->bhash_size)];
spin_lock(&bhead->lock);
- refcnt += inet_twsk_bind_unhash(tw, hashinfo);
+ inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
- BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
- atomic_sub(refcnt, &tw->tw_refcnt);
atomic_dec(&tw->tw_dr->tw_count);
inet_twsk_put(tw);
}
@@ -153,13 +123,15 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
/*
* Step 2: Hash TW into tcp ehash chain.
* Notes :
- * - tw_refcnt is set to 3 because :
+ * - tw_refcnt is set to 4 because :
* - We have one reference from bhash chain.
* - We have one reference from ehash chain.
+ * - We have one reference from timer.
+ * - One reference for ourself (our caller will release it).
* We can use atomic_set() because prior spin_lock()/spin_unlock()
* committed into memory all tw fields.
*/
- atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
+ atomic_set(&tw->tw_refcnt, 4);
inet_twsk_add_node_rcu(tw, &ehead->chain);
/* Step 3: Remove SK from hash chain */
@@ -170,7 +142,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
-void tw_timer_handler(unsigned long data)
+static void tw_timer_handler(unsigned long data)
{
struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
@@ -235,15 +207,19 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc);
* tcp_input.c to verify this.
*/
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void inet_twsk_deschedule(struct inet_timewait_sock *tw)
+/* This is for handling early-kills of TIME_WAIT sockets.
+ * Warning : consume reference.
+ * Caller should not access tw anymore.
+ */
+void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
if (del_timer_sync(&tw->tw_timer))
inet_twsk_kill(tw);
+ inet_twsk_put(tw);
}
-EXPORT_SYMBOL(inet_twsk_deschedule);
+EXPORT_SYMBOL(inet_twsk_deschedule_put);
-void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
+void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
{
/* timeout := RTO * 3.5
*
@@ -271,12 +247,14 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
*/
tw->tw_kill = timeo <= 4*HZ;
- if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) {
- atomic_inc(&tw->tw_refcnt);
+ if (!rearm) {
+ BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo));
atomic_inc(&tw->tw_dr->tw_count);
+ } else {
+ mod_timer_pending(&tw->tw_timer, jiffies + timeo);
}
}
-EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
void inet_twsk_purge(struct inet_hashinfo *hashinfo,
struct inet_timewait_death_row *twdr, int family)
@@ -311,9 +289,8 @@ restart:
rcu_read_unlock();
local_bh_disable();
- inet_twsk_deschedule(tw);
+ inet_twsk_deschedule_put(tw);
local_bh_enable();
- inet_twsk_put(tw);
goto restart_rcu;
}
/* If the nulls value we got at the end of this lookup is
diff --git a/kernel/net/ipv4/inetpeer.c b/kernel/net/ipv4/inetpeer.c
index 241afd743..86fa45809 100644
--- a/kernel/net/ipv4/inetpeer.c
+++ b/kernel/net/ipv4/inetpeer.c
@@ -157,22 +157,6 @@ void __init inet_initpeers(void)
INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
}
-static int addr_compare(const struct inetpeer_addr *a,
- const struct inetpeer_addr *b)
-{
- int i, n = (a->family == AF_INET ? 1 : 4);
-
- for (i = 0; i < n; i++) {
- if (a->addr.a6[i] == b->addr.a6[i])
- continue;
- if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
- return -1;
- return 1;
- }
-
- return 0;
-}
-
#define rcu_deref_locked(X, BASE) \
rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
@@ -188,7 +172,7 @@ static int addr_compare(const struct inetpeer_addr *a,
*stackptr++ = &_base->root; \
for (u = rcu_deref_locked(_base->root, _base); \
u != peer_avl_empty;) { \
- int cmp = addr_compare(_daddr, &u->daddr); \
+ int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \
if (cmp == 0) \
break; \
if (cmp == -1) \
@@ -215,7 +199,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
int count = 0;
while (u != peer_avl_empty) {
- int cmp = addr_compare(daddr, &u->daddr);
+ int cmp = inetpeer_addr_cmp(daddr, &u->daddr);
if (cmp == 0) {
/* Before taking a reference, check if this entry was
* deleted (refcnt=-1)
diff --git a/kernel/net/ipv4/ip_forward.c b/kernel/net/ipv4/ip_forward.c
index 367448494..da0d7ce85 100644
--- a/kernel/net/ipv4/ip_forward.c
+++ b/kernel/net/ipv4/ip_forward.c
@@ -39,17 +39,21 @@
#include <net/route.h>
#include <net/xfrm.h>
-static bool ip_may_fragment(const struct sk_buff *skb)
-{
- return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
- skb->ignore_df;
-}
-
static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
return false;
+ if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0))
+ return false;
+
+ /* original fragment exceeds mtu and DF is set */
+ if (unlikely(IPCB(skb)->frag_max_size > mtu))
+ return true;
+
+ if (skb->ignore_df)
+ return false;
+
if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
return false;
@@ -57,18 +61,18 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
}
-static int ip_forward_finish(struct sock *sk, struct sk_buff *skb)
+static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
- IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
skb_sender_cpu_clear(skb);
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
int ip_forward(struct sk_buff *skb)
@@ -77,6 +81,7 @@ int ip_forward(struct sk_buff *skb)
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
+ struct net *net;
/* that should never happen */
if (skb->pkt_type != PACKET_HOST)
@@ -95,6 +100,7 @@ int ip_forward(struct sk_buff *skb)
return NET_RX_SUCCESS;
skb_forward_csum(skb);
+ net = dev_net(skb->dev);
/*
* According to the RFC, we must first decrease the TTL field. If
@@ -114,8 +120,8 @@ int ip_forward(struct sk_buff *skb)
IPCB(skb)->flags |= IPSKB_FORWARDED;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
- if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
- IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
+ if (ip_exceeds_mtu(skb, mtu)) {
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
goto drop;
@@ -139,8 +145,9 @@ int ip_forward(struct sk_buff *skb)
skb->priority = rt_tos2priority(iph->tos);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
- skb->dev, rt->dst.dev, ip_forward_finish);
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, rt->dst.dev,
+ ip_forward_finish);
sr_failed:
/*
@@ -151,7 +158,7 @@ sr_failed:
too_many_hops:
/* Tell the sender its packet died... */
- IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
kfree_skb(skb);
diff --git a/kernel/net/ipv4/ip_fragment.c b/kernel/net/ipv4/ip_fragment.c
index cae22a1a8..b8a0607da 100644
--- a/kernel/net/ipv4/ip_fragment.c
+++ b/kernel/net/ipv4/ip_fragment.c
@@ -48,6 +48,7 @@
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
#include <net/inet_ecn.h>
+#include <net/l3mdev.h>
/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -75,7 +76,9 @@ struct ipq {
__be16 id;
u8 protocol;
u8 ecn; /* RFC3168 support */
+ u16 max_df_size; /* largest frag with DF set seen */
int iif;
+ int vif; /* L3 master device index */
unsigned int rid;
struct inet_peer *peer;
};
@@ -98,6 +101,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct ip4_create_arg {
struct iphdr *iph;
u32 user;
+ int vif;
};
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
@@ -126,7 +130,8 @@ static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
qp->saddr == arg->iph->saddr &&
qp->daddr == arg->iph->daddr &&
qp->protocol == arg->iph->protocol &&
- qp->user == arg->user;
+ qp->user == arg->user &&
+ qp->vif == arg->vif;
}
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -143,9 +148,11 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
qp->ecn = ip4_frag_ecn(arg->iph->tos);
qp->saddr = arg->iph->saddr;
qp->daddr = arg->iph->daddr;
+ qp->vif = arg->vif;
qp->user = arg->user;
qp->peer = sysctl_ipfrag_max_dist ?
- inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
+ inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
+ NULL;
}
static void ip4_frag_free(struct inet_frag_queue *q)
@@ -173,6 +180,15 @@ static void ipq_kill(struct ipq *ipq)
inet_frag_kill(&ipq->q, &ip4_frags);
}
+static bool frag_expire_skip_icmp(u32 user)
+{
+ return user == IP_DEFRAG_AF_PACKET ||
+ ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
+ __IP_DEFRAG_CONNTRACK_IN_END) ||
+ ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
+ __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
+}
+
/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
@@ -192,7 +208,7 @@ static void ip_expire(unsigned long arg)
ipq_kill(qp);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
- if (!(qp->q.flags & INET_FRAG_EVICTED)) {
+ if (!inet_frag_evicting(&qp->q)) {
struct sk_buff *head = qp->q.fragments;
const struct iphdr *iph;
int err;
@@ -217,10 +233,8 @@ static void ip_expire(unsigned long arg)
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
- if (qp->user == IP_DEFRAG_AF_PACKET ||
- ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
- (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
- (skb_rtable(head)->rt_type != RTN_LOCAL)))
+ if (frag_expire_skip_icmp(qp->user) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL))
goto out_rcu_unlock;
/* Send an ICMP "Fragment Reassembly Timeout" message. */
@@ -236,7 +250,8 @@ out:
/* Find the correct entry in the "incomplete datagrams" queue for
* this IP datagram, and create new one, if nothing is found.
*/
-static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
+static struct ipq *ip_find(struct net *net, struct iphdr *iph,
+ u32 user, int vif)
{
struct inet_frag_queue *q;
struct ip4_create_arg arg;
@@ -244,6 +259,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
arg.iph = iph;
arg.user = user;
+ arg.vif = vif;
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
@@ -301,7 +317,7 @@ static int ip_frag_reinit(struct ipq *qp)
kfree_skb(fp);
fp = xp;
} while (fp);
- sub_frag_mem_limit(&qp->q, sum_truesize);
+ sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
@@ -319,6 +335,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct sk_buff *prev, *next;
struct net_device *dev;
+ unsigned int fragsize;
int flags, offset;
int ihl, end;
int err = -ENOENT;
@@ -446,7 +463,7 @@ found:
qp->q.fragments = next;
qp->q.meat -= free_it->len;
- sub_frag_mem_limit(&qp->q, free_it->truesize);
+ sub_frag_mem_limit(qp->q.net, free_it->truesize);
kfree_skb(free_it);
}
}
@@ -470,13 +487,18 @@ found:
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
- add_frag_mem_limit(&qp->q, skb->truesize);
+ add_frag_mem_limit(qp->q.net, skb->truesize);
if (offset == 0)
qp->q.flags |= INET_FRAG_FIRST_IN;
+ fragsize = skb->len + ihl;
+
+ if (fragsize > qp->q.max_size)
+ qp->q.max_size = fragsize;
+
if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
- skb->len + ihl > qp->q.max_size)
- qp->q.max_size = skb->len + ihl;
+ fragsize > qp->max_df_size)
+ qp->max_df_size = fragsize;
if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
qp->q.meat == qp->q.len) {
@@ -508,7 +530,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
int len;
int ihlen;
int err;
- int sum_truesize;
u8 ecn;
ipq_kill(qp);
@@ -573,47 +594,47 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->len -= clone->len;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(&qp->q, clone->truesize);
+ add_frag_mem_limit(qp->q.net, clone->truesize);
}
+ skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- sum_truesize = head->truesize;
- for (fp = head->next; fp;) {
- bool headstolen;
- int delta;
- struct sk_buff *next = fp->next;
-
- sum_truesize += fp->truesize;
+ for (fp=head->next; fp; fp = fp->next) {
+ head->data_len += fp->len;
+ head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
-
- if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
- kfree_skb_partial(fp, headstolen);
- } else {
- if (!skb_shinfo(head)->frag_list)
- skb_shinfo(head)->frag_list = fp;
- head->data_len += fp->len;
- head->len += fp->len;
- head->truesize += fp->truesize;
- }
- fp = next;
+ head->truesize += fp->truesize;
}
- sub_frag_mem_limit(&qp->q, sum_truesize);
+ sub_frag_mem_limit(qp->q.net, head->truesize);
head->next = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
- IPCB(head)->frag_max_size = qp->q.max_size;
+ IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
iph = ip_hdr(head);
- /* max_size != 0 implies at least one fragment had IP_DF set */
- iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
iph->tot_len = htons(len);
iph->tos |= ecn;
+ /* When we set IP_DF on a refragmented skb we must also force a
+ * call to ip_fragment to avoid forwarding a DF-skb of size s while
+ * original sender only sent fragments of size f (where f < s).
+ *
+ * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
+ * frag seen to avoid sending tiny DF-fragments in case skb was built
+ * from one very small df-fragment and one large non-df frag.
+ */
+ if (qp->max_df_size == qp->q.max_size) {
+ IPCB(head)->flags |= IPSKB_FRAG_PMTU;
+ iph->frag_off = htons(IP_DF);
+ } else {
+ iph->frag_off = 0;
+ }
+
ip_send_check(iph);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
@@ -633,16 +654,17 @@ out_fail:
}
/* Process an incoming IP datagram fragment. */
-int ip_defrag(struct sk_buff *skb, u32 user)
+int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
+ struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
+ int vif = l3mdev_master_ifindex_rcu(dev);
struct ipq *qp;
- struct net *net;
- net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
+ skb_orphan(skb);
/* Lookup (or create) queue header */
- qp = ip_find(net, ip_hdr(skb), user);
+ qp = ip_find(net, ip_hdr(skb), user, vif);
if (qp) {
int ret;
@@ -661,7 +683,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_defrag);
-struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
+struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
struct iphdr iph;
int netoff;
@@ -690,7 +712,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
if (pskb_trim_rcsum(skb, netoff + len))
return skb;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
- if (ip_defrag(skb, user))
+ if (ip_defrag(net, skb, user))
return NULL;
skb_clear_hash(skb);
}
@@ -818,6 +840,8 @@ static void __init ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net)
{
+ int res;
+
/* Fragment cache limits.
*
* The fragment memory accounting code, (tries to) account for
@@ -841,9 +865,13 @@ static int __net_init ipv4_frags_init_net(struct net *net)
*/
net->ipv4.frags.timeout = IP_FRAG_TIME;
- inet_frags_init_net(&net->ipv4.frags);
-
- return ip4_frags_ns_ctl_register(net);
+ res = inet_frags_init_net(&net->ipv4.frags);
+ if (res)
+ return res;
+ res = ip4_frags_ns_ctl_register(net);
+ if (res)
+ inet_frags_uninit_net(&net->ipv4.frags);
+ return res;
}
static void __net_exit ipv4_frags_exit_net(struct net *net)
diff --git a/kernel/net/ipv4/ip_gre.c b/kernel/net/ipv4/ip_gre.c
index 5fd706473..614521437 100644
--- a/kernel/net/ipv4/ip_gre.c
+++ b/kernel/net/ipv4/ip_gre.c
@@ -25,6 +25,7 @@
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
+#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
@@ -47,6 +48,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
+#include <net/dst_metadata.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -121,8 +123,127 @@ static int ipgre_tunnel_init(struct net_device *dev);
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
-static int ipgre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
+static int ip_gre_calc_hlen(__be16 o_flags)
+{
+ int addend = 4;
+
+ if (o_flags & TUNNEL_CSUM)
+ addend += 4;
+ if (o_flags & TUNNEL_KEY)
+ addend += 4;
+ if (o_flags & TUNNEL_SEQ)
+ addend += 4;
+ return addend;
+}
+
+static __be16 gre_flags_to_tnl_flags(__be16 flags)
+{
+ __be16 tflags = 0;
+
+ if (flags & GRE_CSUM)
+ tflags |= TUNNEL_CSUM;
+ if (flags & GRE_ROUTING)
+ tflags |= TUNNEL_ROUTING;
+ if (flags & GRE_KEY)
+ tflags |= TUNNEL_KEY;
+ if (flags & GRE_SEQ)
+ tflags |= TUNNEL_SEQ;
+ if (flags & GRE_STRICT)
+ tflags |= TUNNEL_STRICT;
+ if (flags & GRE_REC)
+ tflags |= TUNNEL_REC;
+ if (flags & GRE_VERSION)
+ tflags |= TUNNEL_VERSION;
+
+ return tflags;
+}
+
+static __be16 tnl_flags_to_gre_flags(__be16 tflags)
+{
+ __be16 flags = 0;
+
+ if (tflags & TUNNEL_CSUM)
+ flags |= GRE_CSUM;
+ if (tflags & TUNNEL_ROUTING)
+ flags |= GRE_ROUTING;
+ if (tflags & TUNNEL_KEY)
+ flags |= GRE_KEY;
+ if (tflags & TUNNEL_SEQ)
+ flags |= GRE_SEQ;
+ if (tflags & TUNNEL_STRICT)
+ flags |= GRE_STRICT;
+ if (tflags & TUNNEL_REC)
+ flags |= GRE_REC;
+ if (tflags & TUNNEL_VERSION)
+ flags |= GRE_VERSION;
+
+ return flags;
+}
+
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err)
+{
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ hdr_len = ip_gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (skb_checksum_simple_validate(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+
+ skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else {
+ tpi->key = 0;
+ }
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else {
+ tpi->seq = 0;
+ }
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = htons(ETH_P_IP);
+ if ((*(u8 *)options & 0xF0) != 0x40) {
+ hdr_len += 4;
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+ }
+ }
+ return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+}
+
+static void ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
{
/* All the routers (except for Linux) return only
@@ -148,14 +269,14 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return PACKET_RCVD;
+ return;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
- return PACKET_RCVD;
+ return;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -164,9 +285,10 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
break;
}
break;
+
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return PACKET_RCVD;
+ return;
break;
case ICMP_REDIRECT:
@@ -183,26 +305,85 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
iph->daddr, iph->saddr, tpi->key);
if (!t)
- return PACKET_REJECT;
+ return;
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
- return PACKET_RCVD;
+ return;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return PACKET_RCVD;
+ return;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
- return PACKET_RCVD;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ *
+ * Moreover, Cisco "wise men" put GRE key to the third word
+ * in GRE header. It makes impossible maintaining even soft
+ * state for keyed
+ * GRE tunnels with enabled checksum. Tell them "thank you".
+ *
+ * Well, I wonder, rfc1812 was written by Cisco employee,
+ * what the hell these idiots break standards established
+ * by themselves???
+ */
+
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+
+ if (parse_gre_header(skb, &tpi, &csum_err)) {
+ if (!csum_err) /* ignore csum errors. */
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ return;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
+ IPPROTO_GRE, 0);
+ return;
+ }
+
+ ipgre_err(skb, info, &tpi);
+}
+
+static __be64 key_to_tunnel_id(__be32 key)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be64)((__force u32)key);
+#else
+ return (__force __be64)((__force u64)key << 32);
+#endif
+}
+
+/* Returns the least-significant 32 bits of a __be64. */
+static __be32 tunnel_id_to_key(__be64 x)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be32)x;
+#else
+ return (__force __be32)((__force u64)x >> 32);
+#endif
}
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
{
struct net *net = dev_net(skb->dev);
+ struct metadata_dst *tun_dst = NULL;
struct ip_tunnel_net *itn;
const struct iphdr *iph;
struct ip_tunnel *tunnel;
@@ -218,40 +399,211 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
if (tunnel) {
skb_pop_mac_header(skb);
- ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
+ if (tunnel->collect_md) {
+ __be16 flags;
+ __be64 tun_id;
+
+ flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
+ tun_id = key_to_tunnel_id(tpi->key);
+ tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
+ if (!tun_dst)
+ return PACKET_REJECT;
+ }
+
+ ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
return PACKET_RCVD;
}
return PACKET_REJECT;
}
+static int gre_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
+ }
+#endif
+
+ if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+ goto drop;
+
+ if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
+ return 0;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
+ __be16 proto, __be32 key, __be32 seq)
+{
+ struct gre_base_hdr *greh;
+
+ skb_push(skb, hdr_len);
+
+ skb_reset_transport_header(skb);
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(flags);
+ greh->protocol = proto;
+
+ if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+ if (flags & TUNNEL_SEQ) {
+ *ptr = seq;
+ ptr--;
+ }
+ if (flags & TUNNEL_KEY) {
+ *ptr = key;
+ ptr--;
+ }
+ if (flags & TUNNEL_CSUM &&
+ !(skb_shinfo(skb)->gso_type &
+ (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
+ *ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
+ }
+ }
+}
+
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params,
__be16 proto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct tnl_ptk_info tpi;
- tpi.flags = tunnel->parms.o_flags;
- tpi.proto = proto;
- tpi.key = tunnel->parms.o_key;
if (tunnel->parms.o_flags & TUNNEL_SEQ)
tunnel->o_seqno++;
- tpi.seq = htonl(tunnel->o_seqno);
/* Push GRE header. */
- gre_build_header(skb, &tpi, tunnel->tun_hlen);
-
- skb_set_inner_protocol(skb, tpi.proto);
+ build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+ proto, tunnel->parms.o_key, htonl(tunnel->o_seqno));
+ skb_set_inner_protocol(skb, proto);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
+static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
+ bool csum)
+{
+ return iptunnel_handle_offloads(skb, csum,
+ csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
+}
+
+static struct rtable *gre_get_rt(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi4 *fl,
+ const struct ip_tunnel_key *key)
+{
+ struct net *net = dev_net(dev);
+
+ memset(fl, 0, sizeof(*fl));
+ fl->daddr = key->u.ipv4.dst;
+ fl->saddr = key->u.ipv4.src;
+ fl->flowi4_tos = RT_TOS(key->tos);
+ fl->flowi4_mark = skb->mark;
+ fl->flowi4_proto = IPPROTO_GRE;
+
+ return ip_route_output_key(net, fl);
+}
+
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ struct flowi4 fl;
+ struct rtable *rt;
+ int min_headroom;
+ int tunnel_hlen;
+ __be16 df, flags;
+ int err;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto err_free_skb;
+
+ key = &tun_info->key;
+ rt = gre_get_rt(skb, dev, &fl, key);
+ if (IS_ERR(rt))
+ goto err_free_skb;
+
+ tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
+
+ min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ + tunnel_hlen + sizeof(struct iphdr);
+ if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+ int head_delta = SKB_DATA_ALIGN(min_headroom -
+ skb_headroom(skb) +
+ 16);
+ err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+ 0, GFP_ATOMIC);
+ if (unlikely(err))
+ goto err_free_rt;
+ }
+
+ /* Push Tunnel header. */
+ skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM));
+ if (IS_ERR(skb)) {
+ skb = NULL;
+ goto err_free_rt;
+ }
+
+ flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
+ build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
+ tunnel_id_to_key(tun_info->key.tun_id), 0);
+
+ df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
+ key->u.ipv4.dst, IPPROTO_GRE,
+ key->tos, key->ttl, df, false);
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+ return;
+
+err_free_rt:
+ ip_rt_put(rt);
+err_free_skb:
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+}
+
+static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ if (ip_tunnel_info_af(info) != AF_INET)
+ return -EINVAL;
+
+ rt = gre_get_rt(skb, dev, &fl4, &info->key);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ ip_rt_put(rt);
+ info->key.u.ipv4.src = fl4.saddr;
+ return 0;
+}
+
static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tnl_params;
+ if (tunnel->collect_md) {
+ gre_fb_xmit(skb, dev);
+ return NETDEV_TX_OK;
+ }
+
if (dev->header_ops) {
/* Need space for new headers */
if (skb_cow_head(skb, dev->needed_headroom -
@@ -277,7 +629,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
goto out;
__gre_xmit(skb, dev, tnl_params, skb->protocol);
-
return NETDEV_TX_OK;
free_skb:
@@ -292,6 +643,11 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ if (tunnel->collect_md) {
+ gre_fb_xmit(skb, dev);
+ return NETDEV_TX_OK;
+ }
+
skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
if (IS_ERR(skb))
goto out;
@@ -300,7 +656,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
goto free_skb;
__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
-
return NETDEV_TX_OK;
free_skb:
@@ -530,10 +885,9 @@ static int ipgre_tunnel_init(struct net_device *dev)
return ip_tunnel_init(dev);
}
-static struct gre_cisco_protocol ipgre_protocol = {
- .handler = ipgre_rcv,
- .err_handler = ipgre_err,
- .priority = 0,
+static const struct gre_protocol ipgre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
};
static int __net_init ipgre_init_net(struct net *net)
@@ -596,8 +950,10 @@ out:
return ipgre_tunnel_validate(tb, data);
}
-static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
- struct ip_tunnel_parm *parms)
+static void ipgre_netlink_parms(struct net_device *dev,
+ struct nlattr *data[],
+ struct nlattr *tb[],
+ struct ip_tunnel_parm *parms)
{
memset(parms, 0, sizeof(*parms));
@@ -635,6 +991,12 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
parms->iph.frag_off = htons(IP_DF);
+
+ if (data[IFLA_GRE_COLLECT_METADATA]) {
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ t->collect_md = true;
+ }
}
/* This function returns true when ENCAP attributes are present in the nl msg */
@@ -688,6 +1050,7 @@ static const struct net_device_ops gre_tap_netdev_ops = {
.ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_fill_metadata_dst = gre_fill_metadata_dst,
};
static void ipgre_tap_setup(struct net_device *dev)
@@ -712,7 +1075,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
return err;
}
- ipgre_netlink_parms(data, tb, &p);
+ ipgre_netlink_parms(dev, data, tb, &p);
return ip_tunnel_newlink(dev, tb, &p);
}
@@ -730,7 +1093,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
return err;
}
- ipgre_netlink_parms(data, tb, &p);
+ ipgre_netlink_parms(dev, data, tb, &p);
return ip_tunnel_changelink(dev, tb, &p);
}
@@ -765,6 +1128,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_GRE_ENCAP_DPORT */
nla_total_size(2) +
+ /* IFLA_GRE_COLLECT_METADATA */
+ nla_total_size(0) +
0;
}
@@ -796,6 +1161,11 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
t->encap.flags))
goto nla_put_failure;
+ if (t->collect_md) {
+ if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
+ goto nla_put_failure;
+ }
+
return 0;
nla_put_failure:
@@ -817,6 +1187,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
};
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
@@ -849,9 +1220,38 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
.get_link_net = ip_tunnel_get_link_net,
};
+struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
+ u8 name_assign_type)
+{
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net_device *dev;
+ struct ip_tunnel *t;
+ int err;
+
+ memset(&tb, 0, sizeof(tb));
+
+ dev = rtnl_create_link(net, name, name_assign_type,
+ &ipgre_tap_ops, tb);
+ if (IS_ERR(dev))
+ return dev;
+
+ /* Configure flow based GRE device. */
+ t = netdev_priv(dev);
+ t->collect_md = true;
+
+ err = ipgre_newlink(net, dev, tb, NULL);
+ if (err < 0)
+ goto out;
+ return dev;
+out:
+ free_netdev(dev);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
+
static int __net_init ipgre_tap_init_net(struct net *net)
{
- return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
+ return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}
static void __net_exit ipgre_tap_exit_net(struct net *net)
@@ -881,7 +1281,7 @@ static int __init ipgre_init(void)
if (err < 0)
goto pnet_tap_faied;
- err = gre_cisco_register(&ipgre_protocol);
+ err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
if (err < 0) {
pr_info("%s: can't add protocol\n", __func__);
goto add_proto_failed;
@@ -900,7 +1300,7 @@ static int __init ipgre_init(void)
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
- gre_cisco_unregister(&ipgre_protocol);
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_faied:
@@ -912,7 +1312,7 @@ static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
- gre_cisco_unregister(&ipgre_protocol);
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
unregister_pernet_device(&ipgre_tap_net_ops);
unregister_pernet_device(&ipgre_net_ops);
}
diff --git a/kernel/net/ipv4/ip_input.c b/kernel/net/ipv4/ip_input.c
index 2db4c8773..b1209b633 100644
--- a/kernel/net/ipv4/ip_input.c
+++ b/kernel/net/ipv4/ip_input.c
@@ -146,6 +146,7 @@
#include <net/xfrm.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
+#include <net/dst_metadata.h>
/*
* Process Router Attention IP option (RFC 2113)
@@ -156,6 +157,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
u8 protocol = ip_hdr(skb)->protocol;
struct sock *last = NULL;
struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
@@ -166,9 +168,9 @@ bool ip_call_ra_chain(struct sk_buff *skb)
if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
- net_eq(sock_net(sk), dev_net(dev))) {
+ net_eq(sock_net(sk), net)) {
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
+ if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
return true;
}
if (last) {
@@ -187,10 +189,8 @@ bool ip_call_ra_chain(struct sk_buff *skb)
return false;
}
-static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb)
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
-
__skb_pull(skb, skb_network_header_len(skb));
rcu_read_lock();
@@ -247,14 +247,15 @@ int ip_local_deliver(struct sk_buff *skb)
/*
* Reassemble IP fragments.
*/
+ struct net *net = dev_net(skb->dev);
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
+ if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
- return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb,
- skb->dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
+ net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
@@ -310,7 +311,7 @@ drop:
int sysctl_ip_early_demux __read_mostly = 1;
EXPORT_SYMBOL(sysctl_ip_early_demux);
-static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
+static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
@@ -331,13 +332,12 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
- if (!skb_dst(skb)) {
+ if (!skb_valid_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EXDEV)
- NET_INC_STATS_BH(dev_net(skb->dev),
- LINUX_MIB_IPRPFILTER);
+ NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER);
goto drop;
}
}
@@ -358,11 +358,9 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
- skb->len);
+ IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
- skb->len);
+ IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
return dst_input(skb);
@@ -377,6 +375,7 @@ drop:
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
const struct iphdr *iph;
+ struct net *net;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
@@ -386,11 +385,12 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
goto drop;
- IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
+ net = dev_net(dev);
+ IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
goto out;
}
@@ -416,7 +416,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
- IP_ADD_STATS_BH(dev_net(dev),
+ IP_ADD_STATS_BH(net,
IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
@@ -430,7 +430,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
len = ntohs(iph->tot_len);
if (skb->len < len) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
@@ -440,7 +440,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -452,14 +452,14 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
- dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
ip_rcv_finish);
csum_error:
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
diff --git a/kernel/net/ipv4/ip_output.c b/kernel/net/ipv4/ip_output.c
index c65b93a7b..49f028563 100644
--- a/kernel/net/ipv4/ip_output.c
+++ b/kernel/net/ipv4/ip_output.c
@@ -83,6 +83,11 @@
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
+static int
+ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *));
+
/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
{
@@ -91,32 +96,28 @@ void ip_send_check(struct iphdr *iph)
}
EXPORT_SYMBOL(ip_send_check);
-int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL,
- skb_dst(skb)->dev, dst_output_sk);
-}
-
-int __ip_local_out(struct sk_buff *skb)
-{
- return __ip_local_out_sk(skb->sk, skb);
+ return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
}
-int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int err;
- err = __ip_local_out(skb);
+ err = __ip_local_out(net, sk, skb);
if (likely(err == 1))
- err = dst_output_sk(sk, skb);
+ err = dst_output(net, sk, skb);
return err;
}
-EXPORT_SYMBOL_GPL(ip_local_out_sk);
+EXPORT_SYMBOL_GPL(ip_local_out);
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
@@ -131,11 +132,12 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
* Add an ip header to a skbuff and send it out.
*
*/
-int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
__be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
struct iphdr *iph;
/* Build the IP header. */
@@ -145,15 +147,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->version = 4;
iph->ihl = 5;
iph->tos = inet->tos;
- if (ip_dont_fragment(sk, &rt->dst))
- iph->frag_off = htons(IP_DF);
- else
- iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
iph->protocol = sk->sk_protocol;
- ip_select_ident(sock_net(sk), skb, sk);
+ if (ip_dont_fragment(sk, &rt->dst)) {
+ iph->frag_off = htons(IP_DF);
+ iph->id = 0;
+ } else {
+ iph->frag_off = 0;
+ __ip_select_ident(net, iph, 1);
+ }
if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
@@ -164,11 +168,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
skb->mark = sk->sk_mark;
/* Send it out. */
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
-static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
@@ -178,9 +182,9 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
u32 nexthop;
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
@@ -216,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
return -EINVAL;
}
-static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output_gso(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
{
netdev_features_t features;
struct sk_buff *segs;
@@ -224,8 +229,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
/* common case: locally created skb or seglen is <= mtu */
if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
- skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
- return ip_finish_output2(sk, skb);
+ skb_gso_network_seglen(skb) <= mtu)
+ return ip_finish_output2(net, sk, skb);
/* Slowpath - GSO segment length is exceeding the dst MTU.
*
@@ -235,6 +240,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
* from host network stack.
*/
features = netif_skb_features(skb);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
@@ -248,7 +254,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
int err;
segs->next = NULL;
- err = ip_fragment(sk, segs, ip_finish_output2);
+ err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
ret = err;
@@ -258,25 +264,28 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
return ret;
}
-static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ unsigned int mtu;
+
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
+ mtu = ip_skb_dst_mtu(skb);
if (skb_is_gso(skb))
- return ip_finish_output_gso(sk, skb);
+ return ip_finish_output_gso(net, sk, skb, mtu);
- if (skb->len > ip_skb_dst_mtu(skb))
- return ip_fragment(sk, skb, ip_finish_output2);
+ if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
+ return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
- return ip_finish_output2(sk, skb);
+ return ip_finish_output2(net, sk, skb);
}
-int ip_mc_output(struct sock *sk, struct sk_buff *skb)
+int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
struct net_device *dev = rt->dst.dev;
@@ -284,7 +293,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
/*
* If the indicated interface is up and running, send the packet.
*/
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
@@ -312,7 +321,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- sk, newskb, NULL, newskb->dev,
+ net, sk, newskb, NULL, newskb->dev,
dev_loopback_xmit);
}
@@ -327,26 +336,28 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
if (rt->rt_flags&RTCF_BROADCAST) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb,
- NULL, newskb->dev, dev_loopback_xmit);
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, newskb, NULL, newskb->dev,
+ dev_loopback_xmit);
}
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL,
- skb->dev, ip_finish_output,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb->dev,
+ ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_output(struct sock *sk, struct sk_buff *skb)
+int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
- NULL, dev,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -369,6 +380,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
@@ -399,7 +411,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+ rt = ip_route_output_ports(net, fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
@@ -436,20 +448,20 @@ packet_routed:
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_segs(sock_net(sk), skb, sk,
+ ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- res = ip_local_out(skb);
+ res = ip_local_out(net, sk, skb);
rcu_read_unlock();
return res;
no_route:
rcu_read_unlock();
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
@@ -478,6 +490,28 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
+static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ if ((iph->frag_off & htons(IP_DF)) == 0)
+ return ip_do_fragment(net, sk, skb, output);
+
+ if (unlikely(!skb->ignore_df ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ return ip_do_fragment(net, sk, skb, output);
+}
+
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
@@ -485,8 +519,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
* single device frame, and queue such a frame for sending.
*/
-int ip_fragment(struct sock *sk, struct sk_buff *skb,
- int (*output)(struct sock *, struct sk_buff *))
+int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph;
int ptr;
@@ -500,6 +534,11 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
dev = rt->dst.dev;
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto fail;
+
/*
* Point into the IP datagram header.
*/
@@ -507,15 +546,8 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
iph = ip_hdr(skb);
mtu = ip_skb_dst_mtu(skb);
- if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
- (IPCB(skb)->frag_max_size &&
- IPCB(skb)->frag_max_size > mtu))) {
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- kfree_skb(skb);
- return -EMSGSIZE;
- }
+ if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
+ mtu = IPCB(skb)->frag_max_size;
/*
* Setup starting values.
@@ -523,10 +555,6 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
hlen = iph->ihl * 4;
mtu = mtu - hlen; /* Size of data space */
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (skb->nf_bridge)
- mtu -= nf_bridge_mtu_reduction(skb);
-#endif
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity:
@@ -599,10 +627,10 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
ip_send_check(iph);
}
- err = output(sk, skb);
+ err = output(net, sk, skb);
if (!err)
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
if (err || !frag)
break;
@@ -612,7 +640,7 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
}
if (err == 0) {
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
return 0;
}
@@ -621,7 +649,7 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
kfree_skb(frag);
frag = skb;
}
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
slow_path_clean:
@@ -635,9 +663,6 @@ slow_path_clean:
}
slow_path:
- /* for offloaded checksums cleanup checksum before fragmentation */
- if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
- goto fail;
iph = ip_hdr(skb);
left = skb->len - hlen; /* Space per frame */
@@ -711,6 +736,9 @@ slow_path:
iph = ip_hdr(skb2);
iph->frag_off = htons((offset >> 3));
+ if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+ iph->frag_off |= htons(IP_DF);
+
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
@@ -736,22 +764,22 @@ slow_path:
ip_send_check(iph);
- err = output(sk, skb2);
+ err = output(net, sk, skb2);
if (err)
goto fail;
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
}
consume_skb(skb);
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
return err;
fail:
kfree_skb(skb);
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
}
-EXPORT_SYMBOL(ip_fragment);
+EXPORT_SYMBOL(ip_do_fragment);
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
@@ -886,6 +914,7 @@ static int __ip_append_data(struct sock *sk,
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & NETIF_F_V4_CSUM &&
+ !(flags & MSG_MORE) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
@@ -893,7 +922,7 @@ static int __ip_append_data(struct sock *sk,
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
- (sk->sk_type == SOCK_DGRAM)) {
+ (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
maxfraglen, flags);
@@ -1217,11 +1246,9 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
}
while (size > 0) {
- int i;
-
- if (skb_is_gso(skb))
+ if (skb_is_gso(skb)) {
len = size;
- else {
+ } else {
/* Check if the remaining data fits into current packet. */
len = mtu - skb->len;
@@ -1273,15 +1300,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
continue;
}
- i = skb_shinfo(skb)->nr_frags;
if (len > size)
len = size;
- if (skb_can_coalesce(skb, i, page, offset)) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
- } else if (i < MAX_SKB_FRAGS) {
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, len);
- } else {
+
+ if (skb_append_pagefrags(skb, page, offset, len)) {
err = -EMSGSIZE;
goto error;
}
@@ -1416,7 +1438,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb)
{
int err;
- err = ip_local_out(skb);
+ err = ip_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
@@ -1524,6 +1546,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
struct net *net = sock_net(sk);
struct sk_buff *nskb;
int err;
+ int oif;
if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
return;
@@ -1541,7 +1564,11 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
daddr = replyopts.opt.opt.faddr;
}
- flowi4_init_output(&fl4, arg->bound_dev_if,
+ oif = arg->bound_dev_if;
+ if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
+ oif = skb->skb_iif;
+
+ flowi4_init_output(&fl4, oif,
IP4_REPLY_MARK(net, skb->mark),
RT_TOS(arg->tos),
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
@@ -1573,7 +1600,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
- skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
ip_push_pending_frames(sk, &fl4);
}
out:
diff --git a/kernel/net/ipv4/ip_sockglue.c b/kernel/net/ipv4/ip_sockglue.c
index 6ddde8999..a50124260 100644
--- a/kernel/net/ipv4/ip_sockglue.c
+++ b/kernel/net/ipv4/ip_sockglue.c
@@ -249,6 +249,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+
+ /* Our caller is responsible for freeing ipc->opt */
err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
err < 40 ? err : 40);
if (err)
@@ -591,6 +593,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_TRANSPARENT:
case IP_MINTTL:
case IP_NODEFRAG:
+ case IP_BIND_ADDRESS_NO_PORT:
case IP_UNICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_ALL:
@@ -741,6 +744,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
inet->nodefrag = val ? 1 : 0;
break;
+ case IP_BIND_ADDRESS_NO_PORT:
+ inet->bind_address_no_port = val ? 1 : 0;
+ break;
case IP_MTU_DISCOVER:
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
goto e_inval;
@@ -1247,11 +1253,22 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
* the _received_ ones. The set sets the _sent_ ones.
*/
+static bool getsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IP_MSFILTER:
+ case MCAST_MSFILTER:
+ return true;
+ }
+ return false;
+}
+
static int do_ip_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
- int val;
+ bool needs_rtnl = getsockopt_needs_rtnl(optname);
+ int val, err = 0;
int len;
if (level != SOL_IP)
@@ -1265,6 +1282,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
if (len < 0)
return -EINVAL;
+ if (needs_rtnl)
+ rtnl_lock();
lock_sock(sk);
switch (optname) {
@@ -1333,6 +1352,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_NODEFRAG:
val = inet->nodefrag;
break;
+ case IP_BIND_ADDRESS_NO_PORT:
+ val = inet->bind_address_no_port;
+ break;
case IP_MTU_DISCOVER:
val = inet->pmtudisc;
break;
@@ -1379,39 +1401,35 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MSFILTER:
{
struct ip_msfilter msf;
- int err;
if (len < IP_MSFILTER_SIZE(0)) {
- release_sock(sk);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
- release_sock(sk);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
err = ip_mc_msfget(sk, &msf,
(struct ip_msfilter __user *)optval, optlen);
- release_sock(sk);
- return err;
+ goto out;
}
case MCAST_MSFILTER:
{
struct group_filter gsf;
- int err;
if (len < GROUP_FILTER_SIZE(0)) {
- release_sock(sk);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
- release_sock(sk);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
err = ip_mc_gsfget(sk, &gsf,
(struct group_filter __user *)optval,
optlen);
- release_sock(sk);
- return err;
+ goto out;
}
case IP_MULTICAST_ALL:
val = inet->mc_all;
@@ -1478,6 +1496,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
}
return 0;
+
+out:
+ release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+ return err;
}
int ip_getsockopt(struct sock *sk, int level,
diff --git a/kernel/net/ipv4/ip_tunnel.c b/kernel/net/ipv4/ip_tunnel.c
index 626d9e56a..cbb51f3fa 100644
--- a/kernel/net/ipv4/ip_tunnel.c
+++ b/kernel/net/ipv4/ip_tunnel.c
@@ -230,10 +230,13 @@ skip_key_lookup:
if (cand)
return cand;
+ t = rcu_dereference(itn->collect_md_tun);
+ if (t)
+ return t;
+
if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
return netdev_priv(itn->fb_tunnel_dev);
-
return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
@@ -261,11 +264,15 @@ static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
struct hlist_head *head = ip_bucket(itn, &t->parms);
+ if (t->collect_md)
+ rcu_assign_pointer(itn->collect_md_tun, t);
hlist_add_head_rcu(&t->hash_node, head);
}
-static void ip_tunnel_del(struct ip_tunnel *t)
+static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
+ if (t->collect_md)
+ rcu_assign_pointer(itn->collect_md_tun, NULL);
hlist_del_init_rcu(&t->hash_node);
}
@@ -419,7 +426,8 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
}
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
- const struct tnl_ptk_info *tpi, bool log_ecn_error)
+ const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
+ bool log_ecn_error)
{
struct pcpu_sw_netstats *tstats;
const struct iphdr *iph = ip_hdr(skb);
@@ -478,6 +486,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
skb->dev = tunnel->dev;
}
+ if (tun_dst)
+ skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
@@ -806,7 +817,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
struct ip_tunnel_parm *p,
bool set_mtu)
{
- ip_tunnel_del(t);
+ ip_tunnel_del(itn, t);
t->parms.iph.saddr = p->iph.saddr;
t->parms.iph.daddr = p->iph.daddr;
t->parms.i_key = p->i_key;
@@ -967,7 +978,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
if (itn->fb_tunnel_dev != dev) {
- ip_tunnel_del(netdev_priv(dev));
+ ip_tunnel_del(itn, netdev_priv(dev));
unregister_netdevice_queue(dev, head);
}
}
@@ -1072,8 +1083,13 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
nt = netdev_priv(dev);
itn = net_generic(net, nt->ip_tnl_net_id);
- if (ip_tunnel_find(itn, p, dev->type))
- return -EEXIST;
+ if (nt->collect_md) {
+ if (rtnl_dereference(itn->collect_md_tun))
+ return -EEXIST;
+ } else {
+ if (ip_tunnel_find(itn, p, dev->type))
+ return -EEXIST;
+ }
nt->net = net;
nt->parms = *p;
@@ -1089,7 +1105,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
dev->mtu = mtu;
ip_tunnel_add(itn, nt);
-
out:
return err;
}
@@ -1163,6 +1178,10 @@ int ip_tunnel_init(struct net_device *dev)
iph->version = 4;
iph->ihl = 5;
+ if (tunnel->collect_md) {
+ dev->features |= NETIF_F_NETNS_LOCAL;
+ netif_keep_dst(dev);
+ }
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
@@ -1176,7 +1195,7 @@ void ip_tunnel_uninit(struct net_device *dev)
itn = net_generic(net, tunnel->ip_tnl_net_id);
/* fb_tunnel_dev will be unregisted in net-exit call. */
if (itn->fb_tunnel_dev != dev)
- ip_tunnel_del(netdev_priv(dev));
+ ip_tunnel_del(itn, netdev_priv(dev));
ip_tunnel_dst_reset_all(tunnel);
}
diff --git a/kernel/net/ipv4/ip_tunnel_core.c b/kernel/net/ipv4/ip_tunnel_core.c
index ce63ab21b..6cb9009c3 100644
--- a/kernel/net/ipv4/ip_tunnel_core.c
+++ b/kernel/net/ipv4/ip_tunnel_core.c
@@ -32,6 +32,7 @@
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
+#include <linux/static_key.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -45,12 +46,14 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
+#include <net/dst_metadata.h>
int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
__u8 tos, __u8 ttl, __be16 df, bool xnet)
{
- int pkt_len = skb->len;
+ int pkt_len = skb->len - skb_inner_network_offset(skb);
+ struct net *net = dev_net(rt->dst.dev);
struct iphdr *iph;
int err;
@@ -74,10 +77,9 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
iph->daddr = dst;
iph->saddr = src;
iph->ttl = ttl;
- __ip_select_ident(dev_net(rt->dst.dev), iph,
- skb_shinfo(skb)->gso_segs ?: 1);
+ __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
- err = ip_local_out_sk(sk, skb);
+ err = ip_local_out(net, sk, skb);
if (unlikely(net_xmit_eval(err)))
pkt_len = 0;
return pkt_len;
@@ -98,7 +100,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
return -ENOMEM;
eh = (struct ethhdr *)skb->data;
- if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
+ if (likely(eth_proto_is_802_3(eh->h_proto)))
skb->protocol = eh->h_proto;
else
skb->protocol = htons(ETH_P_802_2);
@@ -118,6 +120,33 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
}
EXPORT_SYMBOL_GPL(iptunnel_pull_header);
+struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
+ gfp_t flags)
+{
+ struct metadata_dst *res;
+ struct ip_tunnel_info *dst, *src;
+
+ if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
+ return NULL;
+
+ res = metadata_dst_alloc(0, flags);
+ if (!res)
+ return NULL;
+
+ dst = &res->u.tun_info;
+ src = &md->u.tun_info;
+ dst->key.tun_id = src->key.tun_id;
+ if (src->mode & IP_TUNNEL_INFO_IPV6)
+ memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
+ sizeof(struct in6_addr));
+ else
+ dst->key.u.ipv4.dst = src->key.u.ipv4.src;
+ dst->mode = src->mode | IP_TUNNEL_INFO_TX;
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
+
struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
bool csum_help,
int gso_type_mask)
@@ -165,6 +194,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
{
int i;
+ netdev_stats_to_stats64(tot, &dev->stats);
+
for_each_possible_cpu(i) {
const struct pcpu_sw_netstats *tstats =
per_cpu_ptr(dev->tstats, i);
@@ -185,22 +216,211 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
tot->tx_bytes += tx_bytes;
}
- tot->multicast = dev->stats.multicast;
+ return tot;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
- tot->rx_crc_errors = dev->stats.rx_crc_errors;
- tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
- tot->rx_length_errors = dev->stats.rx_length_errors;
- tot->rx_frame_errors = dev->stats.rx_frame_errors;
- tot->rx_errors = dev->stats.rx_errors;
+static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
+ [LWTUNNEL_IP_ID] = { .type = NLA_U64 },
+ [LWTUNNEL_IP_DST] = { .type = NLA_U32 },
+ [LWTUNNEL_IP_SRC] = { .type = NLA_U32 },
+ [LWTUNNEL_IP_TTL] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_TOS] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
+};
+
+static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct ip_tunnel_info *tun_info;
+ struct lwtunnel_state *new_state;
+ struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
+ int err;
- tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
- tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
- tot->tx_dropped = dev->stats.tx_dropped;
- tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
- tot->tx_errors = dev->stats.tx_errors;
+ err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy);
+ if (err < 0)
+ return err;
- tot->collisions = dev->stats.collisions;
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ if (!new_state)
+ return -ENOMEM;
- return tot;
+ new_state->type = LWTUNNEL_ENCAP_IP;
+
+ tun_info = lwt_tun_info(new_state);
+
+ if (tb[LWTUNNEL_IP_ID])
+ tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]);
+
+ if (tb[LWTUNNEL_IP_DST])
+ tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]);
+
+ if (tb[LWTUNNEL_IP_SRC])
+ tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]);
+
+ if (tb[LWTUNNEL_IP_TTL])
+ tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
+
+ if (tb[LWTUNNEL_IP_TOS])
+ tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
+
+ if (tb[LWTUNNEL_IP_FLAGS])
+ tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]);
+
+ tun_info->mode = IP_TUNNEL_INFO_TX;
+ tun_info->options_len = 0;
+
+ *ts = new_state;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
+
+static int ip_tun_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+ if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
+ nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
+ nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
+ nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
+ nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
+ nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ return nla_total_size(8) /* LWTUNNEL_IP_ID */
+ + nla_total_size(4) /* LWTUNNEL_IP_DST */
+ + nla_total_size(4) /* LWTUNNEL_IP_SRC */
+ + nla_total_size(1) /* LWTUNNEL_IP_TOS */
+ + nla_total_size(1) /* LWTUNNEL_IP_TTL */
+ + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */
+}
+
+static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ return memcmp(lwt_tun_info(a), lwt_tun_info(b),
+ sizeof(struct ip_tunnel_info));
+}
+
+static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
+ .build_state = ip_tun_build_state,
+ .fill_encap = ip_tun_fill_encap_info,
+ .get_encap_size = ip_tun_encap_nlsize,
+ .cmp_encap = ip_tun_cmp_encap,
+};
+
+static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
+ [LWTUNNEL_IP6_ID] = { .type = NLA_U64 },
+ [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) },
+ [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) },
+ [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 },
+ [LWTUNNEL_IP6_TC] = { .type = NLA_U8 },
+ [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
+};
+
+static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct ip_tunnel_info *tun_info;
+ struct lwtunnel_state *new_state;
+ struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy);
+ if (err < 0)
+ return err;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ if (!new_state)
+ return -ENOMEM;
+
+ new_state->type = LWTUNNEL_ENCAP_IP6;
+
+ tun_info = lwt_tun_info(new_state);
+
+ if (tb[LWTUNNEL_IP6_ID])
+ tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP6_ID]);
+
+ if (tb[LWTUNNEL_IP6_DST])
+ tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);
+
+ if (tb[LWTUNNEL_IP6_SRC])
+ tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]);
+
+ if (tb[LWTUNNEL_IP6_HOPLIMIT])
+ tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]);
+
+ if (tb[LWTUNNEL_IP6_TC])
+ tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
+
+ if (tb[LWTUNNEL_IP6_FLAGS])
+ tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]);
+
+ tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
+ tun_info->options_len = 0;
+
+ *ts = new_state;
+
+ return 0;
+}
+
+static int ip6_tun_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
+
+ if (nla_put_u64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) ||
+ nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
+ nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
+ nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) ||
+ nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) ||
+ nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ return nla_total_size(8) /* LWTUNNEL_IP6_ID */
+ + nla_total_size(16) /* LWTUNNEL_IP6_DST */
+ + nla_total_size(16) /* LWTUNNEL_IP6_SRC */
+ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */
+ + nla_total_size(1) /* LWTUNNEL_IP6_TC */
+ + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */
+}
+
+static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
+ .build_state = ip6_tun_build_state,
+ .fill_encap = ip6_tun_fill_encap_info,
+ .get_encap_size = ip6_tun_encap_nlsize,
+ .cmp_encap = ip_tun_cmp_encap,
+};
+
+void __init ip_tunnel_core_init(void)
+{
+ lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
+ lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
+}
+
+struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
+
+void ip_tunnel_need_metadata(void)
+{
+ static_key_slow_inc(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
+
+void ip_tunnel_unneed_metadata(void)
+{
+ static_key_slow_dec(&ip_tunnel_metadata_cnt);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
diff --git a/kernel/net/ipv4/ip_vti.c b/kernel/net/ipv4/ip_vti.c
index 0c152087c..4d8f0b698 100644
--- a/kernel/net/ipv4/ip_vti.c
+++ b/kernel/net/ipv4/ip_vti.c
@@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
skb_dst_set(skb, dst);
skb->dev = skb_dst(skb)->dev;
- err = dst_output(skb);
+ err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
err = skb->len;
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
diff --git a/kernel/net/ipv4/ipconfig.c b/kernel/net/ipv4/ipconfig.c
index 8e7328c6a..0bc7412d9 100644
--- a/kernel/net/ipv4/ipconfig.c
+++ b/kernel/net/ipv4/ipconfig.c
@@ -94,7 +94,7 @@
/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
#define CONF_SEND_RETRIES 6 /* Send six requests per open */
-#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */
+#define CONF_INTER_TIMEOUT (HZ) /* Inter-device timeout: 1 second */
#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
@@ -146,6 +146,10 @@ u8 root_server_path[256] = { 0, }; /* Path to mount as root */
/* vendor class identifier */
static char vendor_class_identifier[253] __initdata;
+#if defined(CONFIG_IP_PNP_DHCP)
+static char dhcp_client_identifier[253] __initdata;
+#endif
+
/* Persistent data: */
static int ic_proto_used; /* Protocol used, if any */
@@ -728,6 +732,16 @@ ic_dhcp_init_options(u8 *options)
memcpy(e, vendor_class_identifier, len);
e += len;
}
+ len = strlen(dhcp_client_identifier + 1);
+ /* the minimum length of identifier is 2, include 1 byte type,
+ * and can not be larger than the length of options
+ */
+ if (len >= 1 && len < 312 - (e - options) - 1) {
+ *e++ = 61;
+ *e++ = len + 1;
+ memcpy(e, dhcp_client_identifier, len + 1);
+ e += len + 1;
+ }
}
*e++ = 255; /* End of the list */
@@ -1557,8 +1571,24 @@ static int __init ic_proto_name(char *name)
return 0;
}
#ifdef CONFIG_IP_PNP_DHCP
- else if (!strcmp(name, "dhcp")) {
+ else if (!strncmp(name, "dhcp", 4)) {
+ char *client_id;
+
ic_proto_enabled &= ~IC_RARP;
+ client_id = strstr(name, "dhcp,");
+ if (client_id) {
+ char *v;
+
+ client_id = client_id + 5;
+ v = strchr(client_id, ',');
+ if (!v)
+ return 1;
+ *v = 0;
+ if (kstrtou8(client_id, 0, dhcp_client_identifier))
+ DBG("DHCP: Invalid client identifier type\n");
+ strncpy(dhcp_client_identifier + 1, v + 1, 251);
+ *v = ',';
+ }
return 1;
}
#endif
diff --git a/kernel/net/ipv4/ipip.c b/kernel/net/ipv4/ipip.c
index ff96396eb..a09fb0dec 100644
--- a/kernel/net/ipv4/ipip.c
+++ b/kernel/net/ipv4/ipip.c
@@ -198,7 +198,7 @@ static int ipip_rcv(struct sk_buff *skb)
goto drop;
if (iptunnel_pull_header(skb, 0, tpi.proto))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+ return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
}
return -1;
@@ -251,10 +251,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
return -EINVAL;
}
- p.i_key = p.o_key = p.i_flags = p.o_flags = 0;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
+ p.i_key = p.o_key = 0;
+ p.i_flags = p.o_flags = 0;
err = ip_tunnel_ioctl(dev, &p, cmd);
if (err)
return err;
diff --git a/kernel/net/ipv4/ipmr.c b/kernel/net/ipv4/ipmr.c
index 3a2c0162c..c3a38353f 100644
--- a/kernel/net/ipv4/ipmr.c
+++ b/kernel/net/ipv4/ipmr.c
@@ -134,7 +134,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mfc_cache *c, struct rtmsg *rtm);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
-static void mroute_clean_tables(struct mr_table *mrt);
+static void mroute_clean_tables(struct mr_table *mrt, bool all);
static void ipmr_expire_process(unsigned long arg);
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
.match = ipmr_rule_match,
.configure = ipmr_rule_configure,
.compare = ipmr_rule_compare,
- .default_pref = fib_default_rule_pref,
.fill = ipmr_rule_fill,
.nlgroup = RTNLGRP_IPV4_RULE,
.policy = ipmr_rule_policy,
@@ -351,7 +350,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
static void ipmr_free_table(struct mr_table *mrt)
{
del_timer_sync(&mrt->ipmr_expire_timer);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, true);
kfree(mrt);
}
@@ -442,10 +441,6 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
@@ -541,10 +536,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
@@ -1209,7 +1200,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
* Close the multicast socket, and clear the vif tables etc
*/
-static void mroute_clean_tables(struct mr_table *mrt)
+static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
int i;
LIST_HEAD(list);
@@ -1218,8 +1209,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
/* Shut down all active vif entries */
for (i = 0; i < mrt->maxvif; i++) {
- if (!(mrt->vif_table[i].flags & VIFF_STATIC))
- vif_delete(mrt, i, 0, &list);
+ if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
+ continue;
+ vif_delete(mrt, i, 0, &list);
}
unregister_netdevice_many(&list);
@@ -1227,7 +1219,7 @@ static void mroute_clean_tables(struct mr_table *mrt)
for (i = 0; i < MFC_LINES; i++) {
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
- if (c->mfc_flags & MFC_STATIC)
+ if (!all && (c->mfc_flags & MFC_STATIC))
continue;
list_del_rcu(&c->list);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
@@ -1262,7 +1254,7 @@ static void mrtsock_destruct(struct sock *sk)
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, false);
}
}
rtnl_unlock();
@@ -1679,17 +1671,18 @@ static void ip_encap(struct net *net, struct sk_buff *skb,
nf_reset(skb);
}
-static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb)
+static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
- IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
/*
@@ -1746,7 +1739,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* to blackhole.
*/
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
ip_rt_put(rt);
goto out_free;
}
@@ -1788,8 +1781,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
- skb->dev, dev,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, dev,
ipmr_forward_finish);
return;
diff --git a/kernel/net/ipv4/netfilter.c b/kernel/net/ipv4/netfilter.c
index 65de0684e..c3776ff67 100644
--- a/kernel/net/ipv4/netfilter.c
+++ b/kernel/net/ipv4/netfilter.c
@@ -17,9 +17,8 @@
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
+int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct flowi4 fl4 = {};
@@ -104,7 +103,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb,
}
}
-static int nf_ip_reroute(struct sk_buff *skb,
+static int nf_ip_reroute(struct net *net, struct sk_buff *skb,
const struct nf_queue_entry *entry)
{
const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
@@ -116,7 +115,7 @@ static int nf_ip_reroute(struct sk_buff *skb,
skb->mark == rt_info->mark &&
iph->daddr == rt_info->daddr &&
iph->saddr == rt_info->saddr))
- return ip_route_me_harder(skb, RTN_UNSPEC);
+ return ip_route_me_harder(net, skb, RTN_UNSPEC);
}
return 0;
}
@@ -197,11 +196,4 @@ static int __init ipv4_netfilter_init(void)
{
return nf_register_afinfo(&nf_ip_afinfo);
}
-
-static void __exit ipv4_netfilter_fini(void)
-{
- nf_unregister_afinfo(&nf_ip_afinfo);
-}
-
-module_init(ipv4_netfilter_init);
-module_exit(ipv4_netfilter_fini);
+subsys_initcall(ipv4_netfilter_init);
diff --git a/kernel/net/ipv4/netfilter/Kconfig b/kernel/net/ipv4/netfilter/Kconfig
index fb20f3631..c187c60e3 100644
--- a/kernel/net/ipv4/netfilter/Kconfig
+++ b/kernel/net/ipv4/netfilter/Kconfig
@@ -58,6 +58,13 @@ config NFT_REJECT_IPV4
default NFT_REJECT
tristate
+config NFT_DUP_IPV4
+ tristate "IPv4 nf_tables packet duplication support"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ select NF_DUP_IPV4
+ help
+ This module enables IPv4 packet duplication support for nf_tables.
+
endif # NF_TABLES_IPV4
config NF_TABLES_ARP
@@ -67,6 +74,13 @@ config NF_TABLES_ARP
endif # NF_TABLES
+config NF_DUP_IPV4
+ tristate "Netfilter IPv4 packet duplication to alternate destination"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ help
+ This option enables the nf_dup_ipv4 core, which duplicates an IPv4
+ packet to be rerouted to another destination.
+
config NF_LOG_ARP
tristate "ARP packet logging"
default m if NETFILTER_ADVANCED=n
@@ -195,7 +209,8 @@ config IP_NF_MATCH_ECN
config IP_NF_MATCH_RPFILTER
tristate '"rpfilter" reverse path filter match support'
- depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW)
+ depends on NETFILTER_ADVANCED
+ depends on IP_NF_MANGLE || IP_NF_RAW
---help---
This option allows you to match packets whose replies would
go out via the interface the packet came in.
diff --git a/kernel/net/ipv4/netfilter/Makefile b/kernel/net/ipv4/netfilter/Makefile
index 7fe6c7035..87b073da1 100644
--- a/kernel/net/ipv4/netfilter/Makefile
+++ b/kernel/net/ipv4/netfilter/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
+obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
# generic IP tables
@@ -70,3 +71,5 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
# just filtering instance of ARP tables for now
obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
+
+obj-$(CONFIG_NF_DUP_IPV4) += nf_dup_ipv4.o
diff --git a/kernel/net/ipv4/netfilter/arp_tables.c b/kernel/net/ipv4/netfilter/arp_tables.c
index a61200754..11dccba47 100644
--- a/kernel/net/ipv4/netfilter/arp_tables.c
+++ b/kernel/net/ipv4/netfilter/arp_tables.c
@@ -186,7 +186,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
dprintf("VIA in mismatch (%s vs %s).%s\n",
indev, arpinfo->iniface,
- arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+ arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : "");
return 0;
}
@@ -195,7 +195,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
dprintf("VIA out mismatch (%s vs %s).%s\n",
outdev, arpinfo->outiface,
- arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+ arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : "");
return 0;
}
@@ -240,23 +240,24 @@ get_entry(const void *base, unsigned int offset)
return (struct arpt_entry *)(base + offset);
}
-static inline __pure
+static inline
struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
{
return (void *)entry + entry->next_offset;
}
unsigned int arpt_do_table(struct sk_buff *skb,
- unsigned int hook,
const struct nf_hook_state *state,
struct xt_table *table)
{
+ unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
unsigned int verdict = NF_DROP;
const struct arphdr *arp;
- struct arpt_entry *e, *back;
+ struct arpt_entry *e, **jumpstack;
const char *indev, *outdev;
- void *table_base;
+ const void *table_base;
+ unsigned int cpu, stackidx = 0;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
@@ -270,16 +271,21 @@ unsigned int arpt_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
+ cpu = smp_processor_id();
/*
* Ensure we load private-> members after we've fetched the base
* pointer.
*/
smp_read_barrier_depends();
- table_base = private->entries[smp_processor_id()];
+ table_base = private->entries;
+ jumpstack = (struct arpt_entry **)private->jumpstack[cpu];
+ /* No TEE support for arptables, so no need to switch to alternate
+ * stack. All targets that reenter must return absolute verdicts.
+ */
e = get_entry(table_base, private->hook_entry[hook]);
- back = get_entry(table_base, private->underflow[hook]);
+ acpar.net = state->net;
acpar.in = state->in;
acpar.out = state->out;
acpar.hooknum = hook;
@@ -289,13 +295,15 @@ unsigned int arpt_do_table(struct sk_buff *skb,
arp = arp_hdr(skb);
do {
const struct xt_entry_target *t;
+ struct xt_counters *counter;
if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
e = arpt_next_entry(e);
continue;
}
- ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1);
t = arpt_get_target_c(e);
@@ -310,27 +318,24 @@ unsigned int arpt_do_table(struct sk_buff *skb,
verdict = (unsigned int)(-v) - 1;
break;
}
- e = back;
- back = get_entry(table_base, back->comefrom);
+ if (stackidx == 0) {
+ e = get_entry(table_base,
+ private->underflow[hook]);
+ } else {
+ e = jumpstack[--stackidx];
+ e = arpt_next_entry(e);
+ }
continue;
}
if (table_base + v
!= arpt_next_entry(e)) {
- /* Save old back ptr in next entry */
- struct arpt_entry *next = arpt_next_entry(e);
- next->comefrom = (void *)back - table_base;
-
- /* set back pointer to next entry */
- back = next;
+ jumpstack[stackidx++] = e;
}
e = get_entry(table_base, v);
continue;
}
- /* Targets which reenter must return
- * abs. verdicts
- */
acpar.target = t->u.kernel.target;
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
@@ -463,7 +468,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
- next:
+next:
duprintf("Finished chain %u\n", hook);
}
return 1;
@@ -521,6 +526,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
if (ret)
return ret;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
+
t = arpt_get_target(e);
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
t->u.user.revision);
@@ -538,6 +547,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
err:
module_put(t->u.kernel.target->me);
out:
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -614,13 +625,14 @@ static inline void cleanup_entry(struct arpt_entry *e)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
+ xt_percpu_counter_free(e->counters.pcnt);
}
/* Checks and translates the user-supplied table segment (held in
* newinfo).
*/
static int translate_table(struct xt_table_info *newinfo, void *entry0,
- const struct arpt_replace *repl)
+ const struct arpt_replace *repl)
{
struct arpt_entry *iter;
unsigned int i;
@@ -702,12 +714,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
- if (newinfo->entries[i] && newinfo->entries[i] != entry0)
- memcpy(newinfo->entries[i], entry0, newinfo->size);
- }
-
return ret;
}
@@ -722,14 +728,16 @@ static void get_counters(const struct xt_table_info *t,
seqcount_t *s = &per_cpu(xt_recseq, cpu);
i = 0;
- xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
u64 bcnt, pcnt;
unsigned int start;
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
do {
start = read_seqcount_begin(s);
- bcnt = iter->counters.bcnt;
- pcnt = iter->counters.pcnt;
+ bcnt = tmp->bcnt;
+ pcnt = tmp->pcnt;
} while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -774,7 +782,7 @@ static int copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
/* ... then copy entire thing ... */
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
@@ -863,16 +871,16 @@ static int compat_table_info(const struct xt_table_info *info,
struct xt_table_info *newinfo)
{
struct arpt_entry *iter;
- void *loc_cpu_entry;
+ const void *loc_cpu_entry;
int ret;
if (!newinfo || !info)
return -EINVAL;
- /* we dont care about newinfo->entries[] */
+ /* we dont care about newinfo->entries */
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ loc_cpu_entry = info->entries;
xt_compat_init_offsets(NFPROTO_ARP, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -884,7 +892,7 @@ static int compat_table_info(const struct xt_table_info *info,
#endif
static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+ const int *len, int compat)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -1037,7 +1045,7 @@ static int __do_replace(struct net *net, const char *name,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ loc_cpu_old_entry = oldinfo->entries;
xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
cleanup_entry(iter);
@@ -1061,7 +1069,7 @@ static int __do_replace(struct net *net, const char *name,
}
static int do_replace(struct net *net, const void __user *user,
- unsigned int len)
+ unsigned int len)
{
int ret;
struct arpt_replace tmp;
@@ -1084,8 +1092,7 @@ static int do_replace(struct net *net, const void __user *user,
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1115,7 +1122,7 @@ static int do_replace(struct net *net, const void __user *user,
static int do_add_counters(struct net *net, const void __user *user,
unsigned int len, int compat)
{
- unsigned int i, curcpu;
+ unsigned int i;
struct xt_counters_info tmp;
struct xt_counters *paddc;
unsigned int num_counters;
@@ -1125,7 +1132,6 @@ static int do_add_counters(struct net *net, const void __user *user,
struct xt_table *t;
const struct xt_table_info *private;
int ret = 0;
- void *loc_cpu_entry;
struct arpt_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT
@@ -1181,12 +1187,13 @@ static int do_add_counters(struct net *net, const void __user *user,
}
i = 0;
- /* Choose the copy that is on our node */
- curcpu = smp_processor_id();
- loc_cpu_entry = private->entries[curcpu];
+
addend = xt_write_recseq_begin();
- xt_entry_foreach(iter, loc_cpu_entry, private->size) {
- ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ xt_entry_foreach(iter, private->entries, private->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_this_cpu_counter(&iter->counters);
+ ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
++i;
}
xt_write_recseq_end(addend);
@@ -1396,7 +1403,7 @@ static int translate_compat_table(const char *name,
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries;
pos = entry1;
size = total_size;
xt_entry_foreach(iter0, entry0, total_size) {
@@ -1416,9 +1423,17 @@ static int translate_compat_table(const char *name,
i = 0;
xt_entry_foreach(iter1, entry1, newinfo->size) {
+ iter1->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(iter1->counters.pcnt)) {
+ ret = -ENOMEM;
+ break;
+ }
+
ret = check_target(iter1, name);
- if (ret != 0)
+ if (ret != 0) {
+ xt_percpu_counter_free(iter1->counters.pcnt);
break;
+ }
++i;
if (strcmp(arpt_get_target(iter1)->u.user.name,
XT_ERROR_TARGET) == 0)
@@ -1448,11 +1463,6 @@ static int translate_compat_table(const char *name,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i)
- if (newinfo->entries[i] && newinfo->entries[i] != entry1)
- memcpy(newinfo->entries[i], entry1, newinfo->size);
-
*pinfo = newinfo;
*pentry0 = entry1;
xt_free_table_info(info);
@@ -1511,8 +1521,7 @@ static int compat_do_replace(struct net *net, void __user *user,
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -1609,7 +1618,6 @@ static int compat_copy_entries_to_user(unsigned int total_size,
void __user *pos;
unsigned int size;
int ret = 0;
- void *loc_cpu_entry;
unsigned int i = 0;
struct arpt_entry *iter;
@@ -1617,11 +1625,9 @@ static int compat_copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy on our node/cpu */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
pos = userptr;
size = total_size;
- xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ xt_entry_foreach(iter, private->entries, total_size) {
ret = compat_copy_entry_to_user(iter, &pos,
&size, counters, i++);
if (ret != 0)
@@ -1790,8 +1796,7 @@ struct xt_table *arpt_register_table(struct net *net,
goto out;
}
- /* choose the copy on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(newinfo, loc_cpu_entry, repl);
@@ -1822,7 +1827,7 @@ void arpt_unregister_table(struct xt_table *table)
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
cleanup_entry(iter);
if (private->number > private->initial_entries)
diff --git a/kernel/net/ipv4/netfilter/arptable_filter.c b/kernel/net/ipv4/netfilter/arptable_filter.c
index 93876d031..1897ee160 100644
--- a/kernel/net/ipv4/netfilter/arptable_filter.c
+++ b/kernel/net/ipv4/netfilter/arptable_filter.c
@@ -27,13 +27,10 @@ static const struct xt_table packet_filter = {
/* The work comes in here from netfilter.c */
static unsigned int
-arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+arptable_filter_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net = dev_net(state->in ? state->in : state->out);
-
- return arpt_do_table(skb, ops->hooknum, state,
- net->ipv4.arptable_filter);
+ return arpt_do_table(skb, state, state->net->ipv4.arptable_filter);
}
static struct nf_hook_ops *arpfilter_ops __read_mostly;
diff --git a/kernel/net/ipv4/netfilter/ip_tables.c b/kernel/net/ipv4/netfilter/ip_tables.c
index 2d0e265fe..b99affad6 100644
--- a/kernel/net/ipv4/netfilter/ip_tables.c
+++ b/kernel/net/ipv4/netfilter/ip_tables.c
@@ -102,7 +102,7 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
dprintf("VIA in mismatch (%s vs %s).%s\n",
indev, ipinfo->iniface,
- ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+ ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : "");
return false;
}
@@ -111,7 +111,7 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
dprintf("VIA out mismatch (%s vs %s).%s\n",
outdev, ipinfo->outiface,
- ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+ ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : "");
return false;
}
@@ -120,7 +120,7 @@ ip_packet_match(const struct iphdr *ip,
FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
dprintf("Packet protocol %hi does not match %hi.%s\n",
ip->protocol, ipinfo->proto,
- ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+ ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : "");
return false;
}
@@ -246,7 +246,8 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
return 0;
}
-static void trace_packet(const struct sk_buff *skb,
+static void trace_packet(struct net *net,
+ const struct sk_buff *skb,
unsigned int hook,
const struct net_device *in,
const struct net_device *out,
@@ -254,15 +255,12 @@ static void trace_packet(const struct sk_buff *skb,
const struct xt_table_info *private,
const struct ipt_entry *e)
{
- const void *table_base;
const struct ipt_entry *root;
const char *hookname, *chainname, *comment;
const struct ipt_entry *iter;
unsigned int rulenum = 0;
- struct net *net = dev_net(in ? in : out);
- table_base = private->entries[smp_processor_id()];
- root = get_entry(table_base, private->hook_entry[hook]);
+ root = get_entry(private->entries, private->hook_entry[hook]);
hookname = chainname = hooknames[hook];
comment = comments[NF_IP_TRACE_COMMENT_RULE];
@@ -278,7 +276,7 @@ static void trace_packet(const struct sk_buff *skb,
}
#endif
-static inline __pure
+static inline
struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
{
return (void *)entry + entry->next_offset;
@@ -287,10 +285,10 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
ipt_do_table(struct sk_buff *skb,
- unsigned int hook,
const struct nf_hook_state *state,
struct xt_table *table)
{
+ unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
const struct iphdr *ip;
/* Initializing verdict to NF_DROP keeps gcc happy. */
@@ -298,12 +296,13 @@ ipt_do_table(struct sk_buff *skb,
const char *indev, *outdev;
const void *table_base;
struct ipt_entry *e, **jumpstack;
- unsigned int *stackptr, origptr, cpu;
+ unsigned int stackidx, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
/* Initialization */
+ stackidx = 0;
ip = ip_hdr(skb);
indev = state->in ? state->in->name : nulldevname;
outdev = state->out ? state->out->name : nulldevname;
@@ -316,6 +315,7 @@ ipt_do_table(struct sk_buff *skb,
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
+ acpar.net = state->net;
acpar.in = state->in;
acpar.out = state->out;
acpar.family = NFPROTO_IPV4;
@@ -331,20 +331,29 @@ ipt_do_table(struct sk_buff *skb,
* pointer.
*/
smp_read_barrier_depends();
- table_base = private->entries[cpu];
+ table_base = private->entries;
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
- stackptr = per_cpu_ptr(private->stackptr, cpu);
- origptr = *stackptr;
+
+ /* Switch to alternate jumpstack if we're being invoked via TEE.
+ * TEE issues XT_CONTINUE verdict on original skb so we must not
+ * clobber the jumpstack.
+ *
+ * For recursion via REJECT or SYNPROXY the stack will be clobbered
+ * but it is no problem since absolute verdict is issued by these.
+ */
+ if (static_key_false(&xt_tee_enabled))
+ jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
e = get_entry(table_base, private->hook_entry[hook]);
- pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
- table->name, hook, origptr,
+ pr_debug("Entering %s(hook %u), UF %p\n",
+ table->name, hook,
get_entry(table_base, private->underflow[hook]));
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
+ struct xt_counters *counter;
IP_NF_ASSERT(e);
if (!ip_packet_match(ip, indev, outdev,
@@ -361,7 +370,8 @@ ipt_do_table(struct sk_buff *skb,
goto no_match;
}
- ADD_COUNTER(e->counters, skb->len, 1);
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, skb->len, 1);
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -369,8 +379,8 @@ ipt_do_table(struct sk_buff *skb,
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
- trace_packet(skb, hook, state->in, state->out,
- table->name, private, e);
+ trace_packet(state->net, skb, hook, state->in,
+ state->out, table->name, private, e);
#endif
/* Standard target? */
if (!t->u.kernel.target->target) {
@@ -383,28 +393,24 @@ ipt_do_table(struct sk_buff *skb,
verdict = (unsigned int)(-v) - 1;
break;
}
- if (*stackptr <= origptr) {
+ if (stackidx == 0) {
e = get_entry(table_base,
private->underflow[hook]);
pr_debug("Underflow (this is normal) "
"to %p\n", e);
} else {
- e = jumpstack[--*stackptr];
+ e = jumpstack[--stackidx];
pr_debug("Pulled %p out from pos %u\n",
- e, *stackptr);
+ e, stackidx);
e = ipt_next_entry(e);
}
continue;
}
if (table_base + v != ipt_next_entry(e) &&
!(e->ip.flags & IPT_F_GOTO)) {
- if (*stackptr >= private->stacksize) {
- verdict = NF_DROP;
- break;
- }
- jumpstack[(*stackptr)++] = e;
+ jumpstack[stackidx++] = e;
pr_debug("Pushed %p into pos %u\n",
- e, *stackptr - 1);
+ e, stackidx - 1);
}
e = get_entry(table_base, v);
@@ -423,11 +429,10 @@ ipt_do_table(struct sk_buff *skb,
/* Verdict */
break;
} while (!acpar.hotdrop);
- pr_debug("Exiting %s; resetting sp from %u to %u\n",
- __func__, *stackptr, origptr);
- *stackptr = origptr;
- xt_write_recseq_end(addend);
- local_bh_enable();
+ pr_debug("Exiting %s; sp at %u\n", __func__, stackidx);
+
+ xt_write_recseq_end(addend);
+ local_bh_enable();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -479,7 +484,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
- XT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < -NF_MAX_VERDICT - 1) {
duprintf("mark_source_chains: bad "
"negative verdict (%i)\n",
@@ -544,7 +549,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
- next:
+next:
duprintf("Finished chain %u\n", hook);
}
return 1;
@@ -665,6 +670,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
if (ret)
return ret;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
+
j = 0;
mtpar.net = net;
mtpar.table = name;
@@ -691,6 +700,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
ret = check_target(e, net, name);
if (ret)
goto err;
+
return 0;
err:
module_put(t->u.kernel.target->me);
@@ -700,6 +710,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
break;
cleanup_match(ematch, net);
}
+
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -784,13 +797,14 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
+ xt_percpu_counter_free(e->counters.pcnt);
}
/* Checks and translates the user-supplied table segment (held in
newinfo) */
static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
- const struct ipt_replace *repl)
+ const struct ipt_replace *repl)
{
struct ipt_entry *iter;
unsigned int i;
@@ -866,12 +880,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
- if (newinfo->entries[i] && newinfo->entries[i] != entry0)
- memcpy(newinfo->entries[i], entry0, newinfo->size);
- }
-
return ret;
}
@@ -887,14 +895,16 @@ get_counters(const struct xt_table_info *t,
seqcount_t *s = &per_cpu(xt_recseq, cpu);
i = 0;
- xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ xt_entry_foreach(iter, t->entries, t->size) {
+ struct xt_counters *tmp;
u64 bcnt, pcnt;
unsigned int start;
+ tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
do {
start = read_seqcount_begin(s);
- bcnt = iter->counters.bcnt;
- pcnt = iter->counters.pcnt;
+ bcnt = tmp->bcnt;
+ pcnt = tmp->pcnt;
} while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -939,11 +949,7 @@ copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy that is on our node/cpu, ...
- * This choice is lazy (because current thread is
- * allowed to migrate to another cpu)
- */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
@@ -1051,16 +1057,16 @@ static int compat_table_info(const struct xt_table_info *info,
struct xt_table_info *newinfo)
{
struct ipt_entry *iter;
- void *loc_cpu_entry;
+ const void *loc_cpu_entry;
int ret;
if (!newinfo || !info)
return -EINVAL;
- /* we dont care about newinfo->entries[] */
+ /* we dont care about newinfo->entries */
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ loc_cpu_entry = info->entries;
xt_compat_init_offsets(AF_INET, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1072,7 +1078,7 @@ static int compat_table_info(const struct xt_table_info *info,
#endif
static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+ const int *len, int compat)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -1181,7 +1187,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct xt_table *t;
struct xt_table_info *oldinfo;
struct xt_counters *counters;
- void *loc_cpu_old_entry;
struct ipt_entry *iter;
ret = 0;
@@ -1224,8 +1229,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
cleanup_entry(iter, net);
xt_free_table_info(oldinfo);
@@ -1271,8 +1275,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1301,9 +1304,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
static int
do_add_counters(struct net *net, const void __user *user,
- unsigned int len, int compat)
+ unsigned int len, int compat)
{
- unsigned int i, curcpu;
+ unsigned int i;
struct xt_counters_info tmp;
struct xt_counters *paddc;
unsigned int num_counters;
@@ -1313,7 +1316,6 @@ do_add_counters(struct net *net, const void __user *user,
struct xt_table *t;
const struct xt_table_info *private;
int ret = 0;
- void *loc_cpu_entry;
struct ipt_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT
@@ -1369,12 +1371,12 @@ do_add_counters(struct net *net, const void __user *user,
}
i = 0;
- /* Choose the copy that is on our node */
- curcpu = smp_processor_id();
- loc_cpu_entry = private->entries[curcpu];
addend = xt_write_recseq_begin();
- xt_entry_foreach(iter, loc_cpu_entry, private->size) {
- ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ xt_entry_foreach(iter, private->entries, private->size) {
+ struct xt_counters *tmp;
+
+ tmp = xt_get_this_cpu_counter(&iter->counters);
+ ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
++i;
}
xt_write_recseq_end(addend);
@@ -1444,7 +1446,6 @@ static int
compat_find_calc_match(struct xt_entry_match *m,
const char *name,
const struct ipt_ip *ip,
- unsigned int hookmask,
int *size)
{
struct xt_match *match;
@@ -1513,8 +1514,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
entry_offset = (void *)e - (void *)base;
j = 0;
xt_ematch_foreach(ematch, e) {
- ret = compat_find_calc_match(ematch, name,
- &e->ip, e->comefrom, &off);
+ ret = compat_find_calc_match(ematch, name, &e->ip, &off);
if (ret != 0)
goto release_matches;
++j;
@@ -1610,6 +1610,10 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
unsigned int j;
int ret = 0;
+ e->counters.pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(e->counters.pcnt))
+ return -ENOMEM;
+
j = 0;
mtpar.net = net;
mtpar.table = name;
@@ -1634,6 +1638,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
break;
cleanup_match(ematch, net);
}
+
+ xt_percpu_counter_free(e->counters.pcnt);
+
return ret;
}
@@ -1718,7 +1725,7 @@ translate_compat_table(struct net *net,
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries;
pos = entry1;
size = total_size;
xt_entry_foreach(iter0, entry0, total_size) {
@@ -1770,11 +1777,6 @@ translate_compat_table(struct net *net,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i)
- if (newinfo->entries[i] && newinfo->entries[i] != entry1)
- memcpy(newinfo->entries[i], entry1, newinfo->size);
-
*pinfo = newinfo;
*pentry0 = entry1;
xt_free_table_info(info);
@@ -1821,8 +1823,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1893,7 +1894,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
void __user *pos;
unsigned int size;
int ret = 0;
- const void *loc_cpu_entry;
unsigned int i = 0;
struct ipt_entry *iter;
@@ -1901,14 +1901,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
if (IS_ERR(counters))
return PTR_ERR(counters);
- /* choose the copy that is on our node/cpu, ...
- * This choice is lazy (because current thread is
- * allowed to migrate to another cpu)
- */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
pos = userptr;
size = total_size;
- xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ xt_entry_foreach(iter, private->entries, total_size) {
ret = compat_copy_entry_to_user(iter, &pos,
&size, counters, i++);
if (ret != 0)
@@ -2083,8 +2078,7 @@ struct xt_table *ipt_register_table(struct net *net,
goto out;
}
- /* choose the copy on our node/cpu, but dont care about preemption */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
@@ -2115,7 +2109,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table)
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
cleanup_entry(iter, net);
if (private->number > private->initial_entries)
diff --git a/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c b/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 771ab3d01..4a9e6db9d 100644
--- a/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/kernel/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -367,6 +367,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
struct clusterip_config *config;
int ret;
+ if (par->nft_compat) {
+ pr_err("cannot use CLUSTERIP target from nftables compat\n");
+ return -EOPNOTSUPP;
+ }
+
if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
@@ -487,14 +492,14 @@ static void arp_print(struct arp_payload *payload)
{
#define HBUFFERLEN 30
char hbuffer[HBUFFERLEN];
- int j,k;
+ int j, k;
- for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+ for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) {
hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
- hbuffer[k++]=':';
+ hbuffer[k++] = ':';
}
- hbuffer[--k]='\0';
+ hbuffer[--k] = '\0';
pr_debug("src %pI4@%s, dst %pI4\n",
&payload->src_ip, hbuffer, &payload->dst_ip);
@@ -502,14 +507,14 @@ static void arp_print(struct arp_payload *payload)
#endif
static unsigned int
-arp_mangle(const struct nf_hook_ops *ops,
+arp_mangle(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
struct clusterip_config *c;
- struct net *net = dev_net(state->in ? state->in : state->out);
+ struct net *net = state->net;
/* we don't care about non-ethernet and non-ipv4 ARP */
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
diff --git a/kernel/net/ipv4/netfilter/ipt_ECN.c b/kernel/net/ipv4/netfilter/ipt_ECN.c
index 4bf3dc49a..270765236 100644
--- a/kernel/net/ipv4/netfilter/ipt_ECN.c
+++ b/kernel/net/ipv4/netfilter/ipt_ECN.c
@@ -72,7 +72,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
tcph->cwr = einfo->proto.tcp.cwr;
inet_proto_csum_replace2(&tcph->check, skb,
- oldval, ((__be16 *)tcph)[6], 0);
+ oldval, ((__be16 *)tcph)[6], false);
return true;
}
diff --git a/kernel/net/ipv4/netfilter/ipt_REJECT.c b/kernel/net/ipv4/netfilter/ipt_REJECT.c
index 87907d4bd..1d16c0f28 100644
--- a/kernel/net/ipv4/netfilter/ipt_REJECT.c
+++ b/kernel/net/ipv4/netfilter/ipt_REJECT.c
@@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(skb, hook);
+ nf_send_reset(par->net, skb, hook);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c b/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c
index e9e677930..5fdc55651 100644
--- a/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/kernel/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -18,7 +18,7 @@
#include <net/netfilter/nf_conntrack_synproxy.h>
static struct iphdr *
-synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
+synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
struct iphdr *iph;
@@ -39,11 +39,14 @@ synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
}
static void
-synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+synproxy_send_tcp(const struct synproxy_net *snet,
+ const struct sk_buff *skb, struct sk_buff *nskb,
struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
struct iphdr *niph, struct tcphdr *nth,
unsigned int tcp_hdr_size)
{
+ struct net *net = nf_ct_net(snet->tmpl);
+
nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
nskb->ip_summed = CHECKSUM_PARTIAL;
nskb->csum_start = (unsigned char *)nth - nskb->head;
@@ -51,7 +54,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
skb_dst_set_noref(nskb, skb_dst(skb));
nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
goto free_nskb;
if (nfct) {
@@ -60,7 +63,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
nf_conntrack_get(nfct);
}
- ip_local_out(nskb);
+ ip_local_out(net, nskb->sk, nskb);
return;
free_nskb:
@@ -68,7 +71,8 @@ free_nskb:
}
static void
-synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+synproxy_send_client_synack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
@@ -104,7 +108,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
@@ -148,7 +152,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
niph, nth, tcp_hdr_size);
}
@@ -188,7 +192,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
}
static void
@@ -220,13 +224,14 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
nth->ack_seq = th->ack_seq;
tcp_flag_word(nth) = TCP_FLAG_ACK;
nth->doff = tcp_hdr_size / 4;
- nth->window = ntohs(htons(th->window) >> opts->wscale);
+ nth->window = htons(ntohs(th->window) >> opts->wscale);
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ niph, nth, tcp_hdr_size);
}
static bool
@@ -257,7 +262,7 @@ static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+ struct synproxy_net *snet = synproxy_pernet(par->net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
@@ -286,7 +291,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(skb, th, &opts);
+ synproxy_send_client_synack(snet, skb, th, &opts);
return NF_DROP;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
@@ -298,11 +303,11 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
+static unsigned int ipv4_synproxy_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *nhs)
{
- struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out));
+ struct synproxy_net *snet = synproxy_pernet(nhs->net);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
struct nf_conn_synproxy *synproxy;
@@ -432,14 +437,12 @@ static struct xt_target synproxy_tg4_reg __read_mostly = {
static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
{
.hook = ipv4_synproxy_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
},
{
.hook = ipv4_synproxy_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
diff --git a/kernel/net/ipv4/netfilter/ipt_ah.c b/kernel/net/ipv4/netfilter/ipt_ah.c
index 14a2aa8b8..a787d07f6 100644
--- a/kernel/net/ipv4/netfilter/ipt_ah.c
+++ b/kernel/net/ipv4/netfilter/ipt_ah.c
@@ -25,7 +25,7 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
bool r;
pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
invert ? '!' : ' ', min, spi, max);
- r=(spi >= min && spi <= max) ^ invert;
+ r = (spi >= min && spi <= max) ^ invert;
pr_debug(" result %s\n", r ? "PASS" : "FAILED");
return r;
}
diff --git a/kernel/net/ipv4/netfilter/ipt_rpfilter.c b/kernel/net/ipv4/netfilter/ipt_rpfilter.c
index 4bfaedf9b..78cc64edd 100644
--- a/kernel/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/kernel/net/ipv4/netfilter/ipt_rpfilter.c
@@ -32,15 +32,14 @@ static __be32 rpfilter_get_saddr(__be32 addr)
return addr;
}
-static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
+static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
const struct net_device *dev, u8 flags)
{
struct fib_result res;
bool dev_match;
- struct net *net = dev_net(dev);
int ret __maybe_unused;
- if (fib_lookup(net, fl4, &res))
+ if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
return false;
if (res.type != RTN_UNICAST) {
@@ -61,9 +60,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
if (FIB_RES_DEV(res) == dev)
dev_match = true;
#endif
- if (dev_match || flags & XT_RPFILTER_LOOSE)
- return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST;
- return dev_match;
+ return dev_match || flags & XT_RPFILTER_LOOSE;
}
static bool rpfilter_is_local(const struct sk_buff *skb)
@@ -98,7 +95,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.flowi4_tos = RT_TOS(iph->tos);
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
- return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert;
+ return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert;
}
static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/kernel/net/ipv4/netfilter/iptable_filter.c b/kernel/net/ipv4/netfilter/iptable_filter.c
index a0f3beca5..397ef2dd1 100644
--- a/kernel/net/ipv4/netfilter/iptable_filter.c
+++ b/kernel/net/ipv4/netfilter/iptable_filter.c
@@ -33,19 +33,16 @@ static const struct xt_table packet_filter = {
};
static unsigned int
-iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_filter_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net;
-
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (state->hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net(state->in ? state->in : state->out);
- return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
}
static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/kernel/net/ipv4/netfilter/iptable_mangle.c b/kernel/net/ipv4/netfilter/iptable_mangle.c
index 62cbb8c5f..ba5d392a1 100644
--- a/kernel/net/ipv4/netfilter/iptable_mangle.c
+++ b/kernel/net/ipv4/netfilter/iptable_mangle.c
@@ -39,7 +39,6 @@ static const struct xt_table packet_mangler = {
static unsigned int
ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
{
- struct net_device *out = state->out;
unsigned int ret;
const struct iphdr *iph;
u_int8_t tos;
@@ -59,8 +58,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
daddr = iph->daddr;
tos = iph->tos;
- ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state,
- dev_net(out)->ipv4.iptable_mangle);
+ ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
/* Reroute for ANY change. */
if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
@@ -69,7 +67,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos) {
- err = ip_route_me_harder(skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -80,18 +78,17 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_mangle_hook(const struct nf_hook_ops *ops,
+iptable_mangle_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (ops->hooknum == NF_INET_LOCAL_OUT)
+ if (state->hook == NF_INET_LOCAL_OUT)
return ipt_mangle_out(skb, state);
- if (ops->hooknum == NF_INET_POST_ROUTING)
- return ipt_do_table(skb, ops->hooknum, state,
- dev_net(state->out)->ipv4.iptable_mangle);
+ if (state->hook == NF_INET_POST_ROUTING)
+ return ipt_do_table(skb, state,
+ state->net->ipv4.iptable_mangle);
/* PREROUTING/INPUT/FORWARD: */
- return ipt_do_table(skb, ops->hooknum, state,
- dev_net(state->in)->ipv4.iptable_mangle);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
}
static struct nf_hook_ops *mangle_ops __read_mostly;
diff --git a/kernel/net/ipv4/netfilter/iptable_nat.c b/kernel/net/ipv4/netfilter/iptable_nat.c
index 0d4d9cdf9..ae2cd2752 100644
--- a/kernel/net/ipv4/netfilter/iptable_nat.c
+++ b/kernel/net/ipv4/netfilter/iptable_nat.c
@@ -28,49 +28,46 @@ static const struct xt_table nf_nat_ipv4_table = {
.af = NFPROTO_IPV4,
};
-static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_do_chain(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct)
{
- struct net *net = nf_ct_net(ct);
-
- return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table);
+ return ipt_do_table(skb, state, state->net->ipv4.nat_table);
}
-static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain);
}
-static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain);
}
-static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_out(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain);
}
-static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain);
}
static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
@@ -78,7 +75,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* After packet filtering, change source */
{
.hook = iptable_nat_ipv4_out,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
@@ -86,7 +82,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_local_fn,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
@@ -94,7 +89,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* After packet filtering, change source */
{
.hook = iptable_nat_ipv4_fn,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
diff --git a/kernel/net/ipv4/netfilter/iptable_raw.c b/kernel/net/ipv4/netfilter/iptable_raw.c
index 0356e6da4..1ba02811a 100644
--- a/kernel/net/ipv4/netfilter/iptable_raw.c
+++ b/kernel/net/ipv4/netfilter/iptable_raw.c
@@ -20,19 +20,16 @@ static const struct xt_table packet_raw = {
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_raw_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net;
-
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (state->hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net(state->in ? state->in : state->out);
- return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_raw);
}
static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/kernel/net/ipv4/netfilter/iptable_security.c b/kernel/net/ipv4/netfilter/iptable_security.c
index 4bce3980c..c2e23d5e9 100644
--- a/kernel/net/ipv4/netfilter/iptable_security.c
+++ b/kernel/net/ipv4/netfilter/iptable_security.c
@@ -37,20 +37,16 @@ static const struct xt_table security_table = {
};
static unsigned int
-iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_security_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net;
-
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (state->hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* Somebody is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net(state->in ? state->in : state->out);
- return ipt_do_table(skb, ops->hooknum, state,
- net->ipv4.iptable_security);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_security);
}
static struct nf_hook_ops *sectbl_ops __read_mostly;
@@ -83,7 +79,7 @@ static int __init iptable_security_init(void)
int ret;
ret = register_pernet_subsys(&iptable_security_net_ops);
- if (ret < 0)
+ if (ret < 0)
return ret;
sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
diff --git a/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 30ad9554b..461ca926f 100644
--- a/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
return NF_ACCEPT;
}
-static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
+static unsigned int ipv4_helper(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -119,7 +119,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
ct, ctinfo);
}
-static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
+static unsigned int ipv4_confirm(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -143,14 +143,14 @@ out:
return nf_conntrack_confirm(skb);
}
-static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb);
+ return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
}
-static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_local(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -158,7 +158,7 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb);
+ return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
}
/* Connection tracking may drop packets, but never alters them, so
@@ -166,42 +166,36 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
{
.hook = ipv4_conntrack_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_conntrack_local,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_helper,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv4_confirm,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
.hook = ipv4_helper,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv4_confirm,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
@@ -280,7 +274,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
return -EINVAL;
}
- h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
+ h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
if (h) {
struct sockaddr_in sin;
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
diff --git a/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 80d5554b9..c567e1b5d 100644
--- a/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/kernel/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -30,7 +30,7 @@ static inline struct nf_icmp_net *icmp_pernet(struct net *net)
}
static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
const struct icmphdr *hp;
struct icmphdr _hdr;
@@ -134,15 +134,17 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
struct nf_conntrack_tuple innertuple, origtuple;
const struct nf_conntrack_l4proto *innerproto;
const struct nf_conntrack_tuple_hash *h;
- u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ const struct nf_conntrack_zone *zone;
+ struct nf_conntrack_zone tmp;
NF_CT_ASSERT(skb->nfct == NULL);
+ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Are they talking about one of our connections? */
if (!nf_ct_get_tuplepr(skb,
skb_network_offset(skb) + ip_hdrlen(skb)
+ sizeof(struct icmphdr),
- PF_INET, &origtuple)) {
+ PF_INET, net, &origtuple)) {
pr_debug("icmp_error_message: failed to get tuple\n");
return -NF_ACCEPT;
}
diff --git a/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c b/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c
index c88b7d434..a04dee536 100644
--- a/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -22,14 +22,13 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
-static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
+static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
+ u_int32_t user)
{
int err;
- skb_orphan(skb);
-
local_bh_disable();
- err = ip_defrag(skb, user);
+ err = ip_defrag(net, skb, user);
local_bh_enable();
if (!err) {
@@ -43,33 +42,32 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
struct sk_buff *skb)
{
- u16 zone = NF_CT_DEFAULT_ZONE;
-
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- if (skb->nfct)
- zone = nf_ct_zone((struct nf_conn *)skb->nfct);
-#endif
+ if (skb->nfct) {
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (skb->nf_bridge &&
- skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
- return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+ }
#endif
+ if (nf_bridge_in_prerouting(skb))
+ return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;
+
if (hooknum == NF_INET_PRE_ROUTING)
- return IP_DEFRAG_CONNTRACK_IN + zone;
+ return IP_DEFRAG_CONNTRACK_IN + zone_id;
else
- return IP_DEFRAG_CONNTRACK_OUT + zone;
+ return IP_DEFRAG_CONNTRACK_OUT + zone_id;
}
-static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_defrag(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(skb->sk);
- if (sk && (sk->sk_family == PF_INET) &&
- inet->nodefrag)
+ if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) &&
+ inet_sk(sk)->nodefrag)
return NF_ACCEPT;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -83,9 +81,9 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
/* Gather fragments. */
if (ip_is_fragment(ip_hdr(skb))) {
enum ip_defrag_users user =
- nf_ct_defrag_user(ops->hooknum, skb);
+ nf_ct_defrag_user(state->hook, skb);
- if (nf_ct_ipv4_gather_frags(skb, user))
+ if (nf_ct_ipv4_gather_frags(state->net, skb, user))
return NF_STOLEN;
}
return NF_ACCEPT;
@@ -94,14 +92,12 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
static struct nf_hook_ops ipv4_defrag_ops[] = {
{
.hook = ipv4_conntrack_defrag,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
{
.hook = ipv4_conntrack_defrag,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
diff --git a/kernel/net/ipv4/netfilter/nf_dup_ipv4.c b/kernel/net/ipv4/netfilter/nf_dup_ipv4.c
new file mode 100644
index 000000000..ceb187308
--- /dev/null
+++ b/kernel/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -0,0 +1,106 @@
+/*
+ * (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag>
+ * (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Extracted from xt_TEE.c
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 or later, as
+ * published by the Free Software Foundation.
+ */
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb,
+ const struct in_addr *gw, int oif)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ memset(&fl4, 0, sizeof(fl4));
+ if (oif != -1)
+ fl4.flowi4_oif = oif;
+
+ fl4.daddr = gw->s_addr;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return false;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ skb->dev = rt->dst.dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ return true;
+}
+
+void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
+ const struct in_addr *gw, int oif)
+{
+ struct iphdr *iph;
+
+ if (this_cpu_read(nf_skb_duplicated))
+ return;
+ /*
+ * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
+ * the original skb, which should continue on its way as if nothing has
+ * happened. The copy should be independently delivered to the gateway.
+ */
+ skb = pskb_copy(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Avoid counting cloned packets towards the original connection. */
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+ nf_conntrack_get(skb->nfct);
+#endif
+ /*
+ * If we are in PREROUTING/INPUT, the checksum must be recalculated
+ * since the length could have changed as a result of defragmentation.
+ *
+ * We also decrease the TTL to mitigate potential loops between two
+ * hosts.
+ *
+ * Set %IP_DF so that the original source is notified of a potentially
+ * decreased MTU on the clone route. IPv6 does this too.
+ */
+ iph = ip_hdr(skb);
+ iph->frag_off |= htons(IP_DF);
+ if (hooknum == NF_INET_PRE_ROUTING ||
+ hooknum == NF_INET_LOCAL_IN)
+ --iph->ttl;
+ ip_send_check(iph);
+
+ if (nf_dup_ipv4_route(net, skb, gw, oif)) {
+ __this_cpu_write(nf_skb_duplicated, true);
+ ip_local_out(net, skb->sk, skb);
+ __this_cpu_write(nf_skb_duplicated, false);
+ } else {
+ kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL_GPL(nf_dup_ipv4);
+
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("nf_dup_ipv4: Duplicate IPv4 packet");
+MODULE_LICENSE("GPL");
diff --git a/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index e59cc05c0..5075b7ecd 100644
--- a/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -120,7 +120,7 @@ static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
oldip = iph->daddr;
newip = t->dst.u3.ip;
}
- inet_proto_csum_replace4(check, skb, oldip, newip, 1);
+ inet_proto_csum_replace4(check, skb, oldip, newip, true);
}
static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
@@ -151,7 +151,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
}
} else
inet_proto_csum_replace2(check, skb,
- htons(oldlen), htons(datalen), 1);
+ htons(oldlen), htons(datalen), true);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -255,9 +255,9 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
unsigned int
-nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -266,7 +266,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
/* maniptype == SRC for postrouting. */
- enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
/* We never see fragments: conntrack defrags on pre-routing
* and local-out, and nf_nat_out protects post-routing.
@@ -295,7 +295,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
case IP_CT_RELATED_REPLY:
if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- ops->hooknum))
+ state->hook))
return NF_DROP;
else
return NF_ACCEPT;
@@ -308,21 +308,21 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
if (!nf_nat_initialized(ct, maniptype)) {
unsigned int ret;
- ret = do_chain(ops, skb, state, ct);
+ ret = do_chain(priv, skb, state, ct);
if (ret != NF_ACCEPT)
return ret;
- if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
+ if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
break;
- ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+ ret = nf_nat_alloc_null_binding(ct, state->hook);
if (ret != NF_ACCEPT)
return ret;
} else {
pr_debug("Already setup manip %s for ct %p\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat,
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat,
state->out))
goto oif_changed;
}
@@ -332,11 +332,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
/* ESTABLISHED */
NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
ctinfo == IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
goto oif_changed;
}
- return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+ return nf_nat_packet(ct, ctinfo, state->hook, skb);
oif_changed:
nf_ct_kill_acct(ct, ctinfo, skb);
@@ -345,9 +345,9 @@ oif_changed:
EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
unsigned int
-nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -355,7 +355,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
unsigned int ret;
__be32 daddr = ip_hdr(skb)->daddr;
- ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
daddr != ip_hdr(skb)->daddr)
skb_dst_drop(skb);
@@ -365,9 +365,9 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
unsigned int
-nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -384,7 +384,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -396,7 +396,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
(ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.src.u.all !=
ct->tuplehash[!dir].tuple.dst.u.all)) {
- err = nf_xfrm_me_harder(skb, AF_INET);
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -407,9 +407,9 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
unsigned int
-nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -424,14 +424,14 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
if (ct->tuplehash[dir].tuple.dst.u3.ip !=
ct->tuplehash[!dir].tuple.src.u3.ip) {
- err = ip_route_me_harder(skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -440,7 +440,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.dst.u.all !=
ct->tuplehash[!dir].tuple.src.u.all) {
- err = nf_xfrm_me_harder(skb, AF_INET);
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/kernel/net/ipv4/netfilter/nf_nat_pptp.c b/kernel/net/ipv4/netfilter/nf_nat_pptp.c
index 657d2307f..b3ca21b2b 100644
--- a/kernel/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/kernel/net/ipv4/netfilter/nf_nat_pptp.c
@@ -45,7 +45,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
struct net *net = nf_ct_net(ct);
const struct nf_conn *master = ct->master;
struct nf_conntrack_expect *other_exp;
- struct nf_conntrack_tuple t;
+ struct nf_conntrack_tuple t = {};
const struct nf_ct_pptp_master *ct_pptp_info;
const struct nf_nat_pptp *nat_pptp_info;
struct nf_nat_range range;
diff --git a/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c b/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 4557b4ab8..7b98baa13 100644
--- a/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/kernel/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -67,7 +67,7 @@ icmp_manip_pkt(struct sk_buff *skb,
hdr = (struct icmphdr *)(skb->data + hdroff);
inet_proto_csum_replace2(&hdr->checksum, skb,
- hdr->un.echo.id, tuple->src.u.icmp.id, 0);
+ hdr->un.echo.id, tuple->src.u.icmp.id, false);
hdr->un.echo.id = tuple->src.u.icmp.id;
return true;
}
diff --git a/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c b/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 7c6766713..ddb894ac1 100644
--- a/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/kernel/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1156,7 +1156,7 @@ static int snmp_parse_mangle(unsigned char *msg,
}
if (obj->type == SNMP_IPADDR)
- mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
+ mangle_address(ctx.begin, ctx.pointer - 4, map, check);
kfree(obj->id);
kfree(obj);
diff --git a/kernel/net/ipv4/netfilter/nf_reject_ipv4.c b/kernel/net/ipv4/netfilter/nf_reject_ipv4.c
index 3262e41ff..c747b2d9e 100644
--- a/kernel/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -99,7 +99,7 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
/* Send RST reply */
-void nf_send_reset(struct sk_buff *oldskb, int hook)
+void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
{
struct sk_buff *nskb;
const struct iphdr *oiph;
@@ -129,7 +129,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
ip4_dst_hoplimit(skb_dst(nskb)));
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
- if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
goto free_nskb;
/* "Never happens" */
@@ -157,7 +157,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
dev_queue_xmit(nskb);
} else
#endif
- ip_local_out(nskb);
+ ip_local_out(net, nskb->sk, nskb);
return;
diff --git a/kernel/net/ipv4/netfilter/nf_tables_arp.c b/kernel/net/ipv4/netfilter/nf_tables_arp.c
index 8412268bb..9d09d4f59 100644
--- a/kernel/net/ipv4/netfilter/nf_tables_arp.c
+++ b/kernel/net/ipv4/netfilter/nf_tables_arp.c
@@ -15,15 +15,15 @@
#include <net/netfilter/nf_tables.h>
static unsigned int
-nft_do_chain_arp(const struct nf_hook_ops *ops,
+nft_do_chain_arp(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo(&pkt, ops, skb, state);
+ nft_set_pktinfo(&pkt, skb, state);
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
static struct nft_af_info nft_af_arp __read_mostly = {
diff --git a/kernel/net/ipv4/netfilter/nf_tables_ipv4.c b/kernel/net/ipv4/netfilter/nf_tables_ipv4.c
index aa180d3a6..ca9dc3c46 100644
--- a/kernel/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -18,18 +18,18 @@
#include <net/ip.h>
#include <net/netfilter/nf_tables_ipv4.h>
-static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops,
+static unsigned int nft_do_chain_ipv4(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb, state);
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
+static unsigned int nft_ipv4_output(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -41,7 +41,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
return NF_ACCEPT;
}
- return nft_do_chain_ipv4(ops, skb, state);
+ return nft_do_chain_ipv4(priv, skb, state);
}
struct nft_af_info nft_af_ipv4 __read_mostly = {
diff --git a/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index bf5c30ae1..f5c66a7a4 100644
--- a/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -26,44 +26,44 @@
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/ip.h>
-static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_do_chain(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb, state);
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_out(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain);
}
static const struct nf_chain_type nft_chain_nat_ipv4 = {
diff --git a/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c b/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c
index e335b0afd..2375b0a8b 100644
--- a/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -21,7 +21,7 @@
#include <net/route.h>
#include <net/ip.h>
-static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+static unsigned int nf_route_table_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -37,7 +37,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb, state);
mark = skb->mark;
iph = ip_hdr(skb);
@@ -45,7 +45,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
daddr = iph->daddr;
tos = iph->tos;
- ret = nft_do_chain(&pkt, ops);
+ ret = nft_do_chain(&pkt, priv);
if (ret != NF_DROP && ret != NF_QUEUE) {
iph = ip_hdr(skb);
@@ -53,7 +53,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos)
- if (ip_route_me_harder(skb, RTN_UNSPEC))
+ if (ip_route_me_harder(state->net, skb, RTN_UNSPEC))
ret = NF_DROP;
}
return ret;
diff --git a/kernel/net/ipv4/netfilter/nft_dup_ipv4.c b/kernel/net/ipv4/netfilter/nft_dup_ipv4.c
new file mode 100644
index 000000000..bf855e64f
--- /dev/null
+++ b/kernel/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+
+struct nft_dup_ipv4 {
+ enum nft_registers sreg_addr:8;
+ enum nft_registers sreg_dev:8;
+};
+
+static void nft_dup_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+ struct in_addr gw = {
+ .s_addr = (__force __be32)regs->data[priv->sreg_addr],
+ };
+ int oif = regs->data[priv->sreg_dev];
+
+ nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif);
+}
+
+static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+ int err;
+
+ if (tb[NFTA_DUP_SREG_ADDR] == NULL)
+ return -EINVAL;
+
+ priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
+ err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr));
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_DUP_SREG_DEV] != NULL) {
+ priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
+ return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ }
+ return 0;
+}
+
+static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) ||
+ nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_dup_ipv4_type;
+static const struct nft_expr_ops nft_dup_ipv4_ops = {
+ .type = &nft_dup_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv4)),
+ .eval = nft_dup_ipv4_eval,
+ .init = nft_dup_ipv4_init,
+ .dump = nft_dup_ipv4_dump,
+};
+
+static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = {
+ [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 },
+ [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 },
+};
+
+static struct nft_expr_type nft_dup_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "dup",
+ .ops = &nft_dup_ipv4_ops,
+ .policy = nft_dup_ipv4_policy,
+ .maxattr = NFTA_DUP_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_dup_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_dup_ipv4_type);
+}
+
+static void __exit nft_dup_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_dup_ipv4_type);
+}
+
+module_init(nft_dup_ipv4_module_init);
+module_exit(nft_dup_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup");
diff --git a/kernel/net/ipv4/netfilter/nft_masq_ipv4.c b/kernel/net/ipv4/netfilter/nft_masq_ipv4.c
index 40e414c4c..b72ffc58e 100644
--- a/kernel/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -26,7 +26,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
- regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
&range, pkt->out);
}
diff --git a/kernel/net/ipv4/netfilter/nft_redir_ipv4.c b/kernel/net/ipv4/netfilter/nft_redir_ipv4.c
index d8d795df9..c09d43814 100644
--- a/kernel/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -36,7 +36,7 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
mr.range[0].flags |= priv->flags;
regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr,
- pkt->ops->hooknum);
+ pkt->hook);
}
static struct nft_expr_type nft_redir_ipv4_type;
diff --git a/kernel/net/ipv4/netfilter/nft_reject_ipv4.c b/kernel/net/ipv4/netfilter/nft_reject_ipv4.c
index b07e58b51..c24f41c81 100644
--- a/kernel/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/kernel/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -27,11 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach(pkt->skb, priv->icmp_code,
- pkt->ops->hooknum);
+ nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook);
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->skb, pkt->ops->hooknum);
+ nf_send_reset(pkt->net, pkt->skb, pkt->hook);
break;
default:
break;
diff --git a/kernel/net/ipv4/ping.c b/kernel/net/ipv4/ping.c
index 05ff44b75..aa67e0e64 100644
--- a/kernel/net/ipv4/ping.c
+++ b/kernel/net/ipv4/ping.c
@@ -363,7 +363,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
scoped);
rcu_read_unlock();
- if (!(isk->freebind || isk->transparent || has_addr ||
+ if (!(net->ipv6.sysctl.ip_nonlocal_bind ||
+ isk->freebind || isk->transparent || has_addr ||
addr_type == IPV6_ADDR_ANY))
return -EADDRNOTAVAIL;
@@ -745,8 +746,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
- if (err)
+ if (unlikely(err)) {
+ kfree(ipc.opt);
return err;
+ }
if (ipc.opt)
free = 1;
}
diff --git a/kernel/net/ipv4/proc.c b/kernel/net/ipv4/proc.c
index e1f3b911d..3abd9d7a3 100644
--- a/kernel/net/ipv4/proc.c
+++ b/kernel/net/ipv4/proc.c
@@ -298,6 +298,10 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
+ SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
+ SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
+ SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
+ SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
SNMP_MIB_SENTINEL
};
diff --git a/kernel/net/ipv4/raw.c b/kernel/net/ipv4/raw.c
index 561cd4b8f..7113bae4e 100644
--- a/kernel/net/ipv4/raw.c
+++ b/kernel/net/ipv4/raw.c
@@ -406,13 +406,16 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
ip_select_ident(net, skb, NULL);
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ skb->transport_header += iphlen;
+ if (iph->protocol == IPPROTO_ICMP &&
+ length >= iphlen + sizeof(struct icmphdr))
+ icmp_out_count(net, ((struct icmphdr *)
+ skb_transport_header(skb))->type);
}
- if (iph->protocol == IPPROTO_ICMP)
- icmp_out_count(net, ((struct icmphdr *)
- skb_transport_header(skb))->type);
- err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb,
- NULL, rt->dst.dev, dst_output_sk);
+ err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, rt->dst.dev,
+ dst_output);
if (err > 0)
err = net_xmit_errno(err);
if (err)
@@ -483,6 +486,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
struct flowi4 fl4;
@@ -542,9 +546,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
- if (err)
+ err = ip_cmsg_send(net, msg, &ipc, false);
+ if (unlikely(err)) {
+ kfree(ipc.opt);
goto out;
+ }
if (ipc.opt)
free = 1;
}
@@ -597,6 +603,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
+ if (!saddr && ipc.oif) {
+ err = l3mdev_get_saddr(net, ipc.oif, &fl4);
+ if (err < 0)
+ goto done;
+ }
+
if (!inet->hdrincl) {
rfv.msg = msg;
rfv.hlen = 0;
@@ -607,7 +619,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
- rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
diff --git a/kernel/net/ipv4/route.c b/kernel/net/ipv4/route.c
index f45f2a12f..02c62299d 100644
--- a/kernel/net/ipv4/route.c
+++ b/kernel/net/ipv4/route.c
@@ -91,6 +91,7 @@
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
+#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
@@ -102,6 +103,7 @@
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
+#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
@@ -109,6 +111,8 @@
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
+#include <net/ip_tunnels.h>
+#include <net/l3mdev.h>
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -125,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
+static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
/*
* Interface to generic destination cache.
*/
@@ -457,12 +462,9 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
}
#define IP_IDENTS_SZ 2048u
-struct ip_ident_bucket {
- atomic_t id;
- u32 stamp32;
-};
-static struct ip_ident_bucket *ip_idents __read_mostly;
+static atomic_t *ip_idents __read_mostly;
+static u32 *ip_tstamps __read_mostly;
/* In order to protect privacy, we add a perturbation to identifiers
* if one generator is seldom used. This makes hard for an attacker
@@ -470,15 +472,16 @@ static struct ip_ident_bucket *ip_idents __read_mostly;
*/
u32 ip_idents_reserve(u32 hash, int segs)
{
- struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
- u32 old = ACCESS_ONCE(bucket->stamp32);
+ u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
+ atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
+ u32 old = ACCESS_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
u32 delta = 0;
- if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
+ if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
- return atomic_add_return(segs + delta, &bucket->id) - segs;
+ return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
@@ -749,11 +752,11 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
if (!(n->nud_state & NUD_VALID)) {
neigh_event_send(n, NULL);
} else {
- if (fib_lookup(net, fl4, &res) == 0) {
+ if (fib_lookup(net, fl4, &res, 0) == 0) {
struct fib_nh *nh = &FIB_RES_NH(res);
update_or_create_fnhe(nh, fl4->daddr, new_gw,
- 0, 0);
+ 0, jiffies + ip_rt_gc_timeout);
}
if (kill_route)
rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -836,6 +839,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
struct inet_peer *peer;
struct net *net;
int log_martians;
+ int vif;
rcu_read_lock();
in_dev = __in_dev_get_rcu(rt->dst.dev);
@@ -844,10 +848,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
return;
}
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
+ vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
rcu_read_unlock();
net = dev_net(rt->dst.dev);
- peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
if (!peer) {
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
rt_nexthop(rt, ip_hdr(skb)->daddr));
@@ -936,7 +941,8 @@ static int ip_error(struct sk_buff *skb)
break;
}
- peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
+ l3mdev_master_ifindex(skb->dev), 1);
send = true;
if (peer) {
@@ -977,7 +983,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
return;
rcu_read_lock();
- if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
+ if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
struct fib_nh *nh = &FIB_RES_NH(res);
update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
@@ -1147,7 +1153,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
dst_set_expires(&rt->dst, 0);
}
-static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
+static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
pr_debug("%s: %pI4 -> %pI4, %s\n",
__func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1188,7 +1194,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
fl4.flowi4_mark = skb->mark;
rcu_read_lock();
- if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
+ if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
else
src = inet_select_addr(rt->dst.dev,
@@ -1405,6 +1411,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
+ rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
if (unlikely(fnhe))
cached = rt_bind_exception(rt, fnhe, daddr);
else if (!(rt->dst.flags & DST_NOCACHE))
@@ -1432,12 +1439,34 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
+ unsigned int flags, u16 type,
bool nopolicy, bool noxfrm, bool will_cache)
{
- return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
- (nopolicy ? DST_NOPOLICY : 0) |
- (noxfrm ? DST_NOXFRM : 0));
+ struct rtable *rt;
+
+ rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+ (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+ (nopolicy ? DST_NOPOLICY : 0) |
+ (noxfrm ? DST_NOXFRM : 0));
+
+ if (rt) {
+ rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+ rt->rt_flags = flags;
+ rt->rt_type = type;
+ rt->rt_is_input = 0;
+ rt->rt_iif = 0;
+ rt->rt_pmtu = 0;
+ rt->rt_gateway = 0;
+ rt->rt_uses_gateway = 0;
+ rt->rt_table_id = 0;
+ INIT_LIST_HEAD(&rt->rt_uncached);
+
+ rt->dst.output = ip_output;
+ if (flags & RTCF_LOCAL)
+ rt->dst.input = ip_local_deliver;
+ }
+
+ return rt;
}
/* called in rcu_read_lock() section */
@@ -1446,6 +1475,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
struct rtable *rth;
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ unsigned int flags = RTCF_MULTICAST;
u32 itag = 0;
int err;
@@ -1458,9 +1488,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
skb->protocol != htons(ETH_P_IP))
goto e_inval;
- if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
- if (ipv4_is_loopback(saddr))
- goto e_inval;
+ if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
+ goto e_inval;
if (ipv4_is_zeronet(saddr)) {
if (!ipv4_is_local_multicast(daddr))
@@ -1471,7 +1500,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (err < 0)
goto e_err;
}
- rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
+ if (our)
+ flags |= RTCF_LOCAL;
+
+ rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
if (!rth)
goto e_nobufs;
@@ -1480,20 +1512,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->dst.tclassid = itag;
#endif
rth->dst.output = ip_rt_bug;
-
- rth->rt_genid = rt_genid_ipv4(dev_net(dev));
- rth->rt_flags = RTCF_MULTICAST;
- rth->rt_type = RTN_MULTICAST;
rth->rt_is_input= 1;
- rth->rt_iif = 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
- if (our) {
- rth->dst.input= ip_local_deliver;
- rth->rt_flags |= RTCF_LOCAL;
- }
#ifdef CONFIG_IP_MROUTE
if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
@@ -1538,6 +1557,36 @@ static void ip_handle_martian_source(struct net_device *dev,
#endif
}
+static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+ u32 hval = fnhe_hashfun(daddr);
+
+ spin_lock_bh(&fnhe_lock);
+
+ hash = rcu_dereference_protected(nh->nh_exceptions,
+ lockdep_is_held(&fnhe_lock));
+ hash += hval;
+
+ fnhe_p = &hash->chain;
+ fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+ while (fnhe) {
+ if (fnhe->fnhe_daddr == daddr) {
+ rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+ fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+ fnhe_flush_routes(fnhe);
+ kfree_rcu(fnhe, rcu);
+ break;
+ }
+ fnhe_p = &fnhe->fnhe_next;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+ lockdep_is_held(&fnhe_lock));
+ }
+
+ spin_unlock_bh(&fnhe_lock);
+}
+
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1548,7 +1597,6 @@ static int __mkroute_input(struct sk_buff *skb,
struct rtable *rth;
int err;
struct in_device *out_dev;
- unsigned int flags = 0;
bool do_cache;
u32 itag = 0;
@@ -1592,18 +1640,27 @@ static int __mkroute_input(struct sk_buff *skb,
fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) {
- if (fnhe)
+ if (fnhe) {
rth = rcu_dereference(fnhe->fnhe_rth_input);
- else
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+ if (rth && rth->dst.expires &&
+ time_after(jiffies, rth->dst.expires)) {
+ ip_del_fnhe(&FIB_RES_NH(*res), daddr);
+ fnhe = NULL;
+ } else {
+ goto rt_cache;
+ }
+ }
+
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+rt_cache:
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
}
}
- rth = rt_dst_alloc(out_dev->dev,
+ rth = rt_dst_alloc(out_dev->dev, 0, res->type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
if (!rth) {
@@ -1611,21 +1668,22 @@ static int __mkroute_input(struct sk_buff *skb,
goto cleanup;
}
- rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
- rth->rt_flags = flags;
- rth->rt_type = res->type;
rth->rt_is_input = 1;
- rth->rt_iif = 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
+ if (res->table)
+ rth->rt_table_id = res->table->tb_id;
RT_CACHE_STAT_INC(in_slow_tot);
rth->dst.input = ip_forward;
- rth->dst.output = ip_output;
rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_output = rth->dst.output;
+ rth->dst.output = lwtunnel_output;
+ }
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1633,6 +1691,48 @@ out:
return err;
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* To make ICMP packets follow the right flow, the multipath hash is
+ * calculated from the inner IP addresses in reverse order.
+ */
+static int ip_multipath_icmp_hash(struct sk_buff *skb)
+{
+ const struct iphdr *outer_iph = ip_hdr(skb);
+ struct icmphdr _icmph;
+ const struct icmphdr *icmph;
+ struct iphdr _inner_iph;
+ const struct iphdr *inner_iph;
+
+ if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
+ goto standard_hash;
+
+ icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
+ &_icmph);
+ if (!icmph)
+ goto standard_hash;
+
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_REDIRECT &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB) {
+ goto standard_hash;
+ }
+
+ inner_iph = skb_header_pointer(skb,
+ outer_iph->ihl * 4 + sizeof(_icmph),
+ sizeof(_inner_iph), &_inner_iph);
+ if (!inner_iph)
+ goto standard_hash;
+
+ return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+
+standard_hash:
+ return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
+}
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
const struct flowi4 *fl4,
@@ -1640,8 +1740,15 @@ static int ip_mkroute_input(struct sk_buff *skb,
__be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1)
- fib_select_multipath(res);
+ if (res->fi && res->fi->fib_nhs > 1) {
+ int h;
+
+ if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
+ h = ip_multipath_icmp_hash(skb);
+ else
+ h = fib_multipath_hash(saddr, daddr);
+ fib_select_multipath(res, h);
+ }
#endif
/* create a routing cache entry */
@@ -1664,6 +1771,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct ip_tunnel_info *tun_info;
struct flowi4 fl4;
unsigned int flags = 0;
u32 itag = 0;
@@ -1681,10 +1789,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
by fib_lookup.
*/
+ tun_info = skb_tunnel_info(skb);
+ if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
+ fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
+ else
+ fl4.flowi4_tun_key.tun_id = 0;
+ skb_dst_drop(skb);
+
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
goto martian_source;
res.fi = NULL;
+ res.table = NULL;
if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
@@ -1712,13 +1828,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
* Now we are ready to route packet.
*/
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = dev->ifindex;
+ fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = 0;
fl4.daddr = daddr;
fl4.saddr = saddr;
- err = fib_lookup(net, &fl4, &res);
+ err = fib_lookup(net, &fl4, &res, 0);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
err = -EHOSTUNREACH;
@@ -1732,7 +1849,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
err = fib_validate_source(skb, saddr, daddr, tos,
0, dev, in_dev, &itag);
if (err < 0)
- goto martian_source_keep_err;
+ goto martian_source;
goto local_input;
}
@@ -1754,7 +1871,7 @@ brd_input:
err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
in_dev, &itag);
if (err < 0)
- goto martian_source_keep_err;
+ goto martian_source;
}
flags |= RTCF_BROADCAST;
res.type = RTN_BROADCAST;
@@ -1774,26 +1891,19 @@ local_input:
}
}
- rth = rt_dst_alloc(net->loopback_dev,
+ rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
- rth->dst.input= ip_local_deliver;
rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
-
- rth->rt_genid = rt_genid_ipv4(net);
- rth->rt_flags = flags|RTCF_LOCAL;
- rth->rt_type = res.type;
rth->rt_is_input = 1;
- rth->rt_iif = 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
+ if (res.table)
+ rth->rt_table_id = res.table->tb_id;
+
RT_CACHE_STAT_INC(in_slow_tot);
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
@@ -1814,6 +1924,7 @@ no_route:
RT_CACHE_STAT_INC(in_no_route);
res.type = RTN_UNREACHABLE;
res.fi = NULL;
+ res.table = NULL;
goto local_input;
/*
@@ -1836,8 +1947,6 @@ e_nobufs:
goto out;
martian_source:
- err = -EINVAL;
-martian_source_keep_err:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
goto out;
}
@@ -1945,19 +2054,29 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
struct fib_nh *nh = &FIB_RES_NH(*res);
fnhe = find_exception(nh, fl4->daddr);
- if (fnhe)
+ if (fnhe) {
prth = &fnhe->fnhe_rth_output;
- else {
- if (unlikely(fl4->flowi4_flags &
- FLOWI_FLAG_KNOWN_NH &&
- !(nh->nh_gw &&
- nh->nh_scope == RT_SCOPE_LINK))) {
- do_cache = false;
- goto add;
+ rth = rcu_dereference(*prth);
+ if (rth && rth->dst.expires &&
+ time_after(jiffies, rth->dst.expires)) {
+ ip_del_fnhe(nh, fl4->daddr);
+ fnhe = NULL;
+ } else {
+ goto rt_cache;
}
- prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
}
+
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&
+ !(nh->nh_gw &&
+ nh->nh_scope == RT_SCOPE_LINK))) {
+ do_cache = false;
+ goto add;
+ }
+ prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
rth = rcu_dereference(*prth);
+
+rt_cache:
if (rt_cache_valid(rth)) {
dst_hold(&rth->dst);
return rth;
@@ -1965,29 +2084,19 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
}
add:
- rth = rt_dst_alloc(dev_out,
+ rth = rt_dst_alloc(dev_out, flags, type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(in_dev, NOXFRM),
do_cache);
if (!rth)
return ERR_PTR(-ENOBUFS);
- rth->dst.output = ip_output;
-
- rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
- rth->rt_flags = flags;
- rth->rt_type = type;
- rth->rt_is_input = 0;
rth->rt_iif = orig_oif ? : 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
+ if (res->table)
+ rth->rt_table_id = res->table->tb_id;
RT_CACHE_STAT_INC(out_slow_tot);
- if (flags & RTCF_LOCAL)
- rth->dst.input = ip_local_deliver;
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
if (flags & RTCF_LOCAL &&
!(dev_out->flags & IFF_LOOPBACK)) {
@@ -2006,6 +2115,8 @@ add:
}
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
+ if (lwtunnel_output_redirect(rth->dst.lwtstate))
+ rth->dst.output = lwtunnel_output;
return rth;
}
@@ -2014,7 +2125,8 @@ add:
* Major route resolver routine.
*/
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+ int mp_hash)
{
struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
@@ -2022,6 +2134,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
struct fib_result res;
struct rtable *rth;
int orig_oif;
+ int err = -ENETUNREACH;
res.tclassid = 0;
res.fi = NULL;
@@ -2097,7 +2210,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto out;
}
if (ipv4_is_local_multicast(fl4->daddr) ||
- ipv4_is_lbcast(fl4->daddr)) {
+ ipv4_is_lbcast(fl4->daddr) ||
+ fl4->flowi4_proto == IPPROTO_IGMP) {
if (!fl4->saddr)
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
@@ -2111,6 +2225,10 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
+
+ rth = l3mdev_get_rtable(dev_out, fl4);
+ if (rth)
+ goto out;
}
if (!fl4->daddr) {
@@ -2124,10 +2242,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto make_route;
}
- if (fib_lookup(net, fl4, &res)) {
+ err = fib_lookup(net, fl4, &res, 0);
+ if (err) {
res.fi = NULL;
res.table = NULL;
- if (fl4->flowi4_oif) {
+ if (fl4->flowi4_oif &&
+ !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2152,7 +2272,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
res.type = RTN_UNICAST;
goto make_route;
}
- rth = ERR_PTR(-ENETUNREACH);
+ rth = ERR_PTR(err);
goto out;
}
@@ -2169,18 +2289,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto make_route;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
- fib_select_multipath(&res);
- else
-#endif
- if (!res.prefixlen &&
- res.table->tb_num_default > 1 &&
- res.type == RTN_UNICAST && !fl4->flowi4_oif)
- fib_select_default(&res);
-
- if (!fl4->saddr)
- fl4->saddr = FIB_RES_PREFSRC(net, res);
+ fib_select_path(net, &res, fl4, mp_hash);
dev_out = FIB_RES_DEV(res);
fl4->flowi4_oif = dev_out->ifindex;
@@ -2193,7 +2302,7 @@ out:
rcu_read_unlock();
return rth;
}
-EXPORT_SYMBOL_GPL(__ip_route_output_key);
+EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
@@ -2245,7 +2354,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->__use = 1;
new->input = dst_discard;
- new->output = dst_discard_sk;
+ new->output = dst_discard_out;
new->dev = ort->dst.dev;
if (new->dev)
@@ -2262,7 +2371,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_uses_gateway = ort->rt_uses_gateway;
INIT_LIST_HEAD(&rt->rt_uncached);
-
dst_free(new);
}
@@ -2272,7 +2380,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
}
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
- struct sock *sk)
+ const struct sock *sk)
{
struct rtable *rt = __ip_route_output_key(net, flp4);
@@ -2288,7 +2396,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
-static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
u32 seq, int event, int nowait, unsigned int flags)
{
@@ -2308,8 +2416,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
r->rtm_tos = fl4->flowi4_tos;
- r->rtm_table = RT_TABLE_MAIN;
- if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
+ r->rtm_table = table_id;
+ if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
r->rtm_type = rt->rt_type;
r->rtm_scope = RT_SCOPE_UNIVERSE;
@@ -2414,6 +2522,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
int err;
int mark;
struct sk_buff *skb;
+ u32 table_id = RT_TABLE_MAIN;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
if (err < 0)
@@ -2449,6 +2558,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
+ if (netif_index_is_l3_master(net, fl4.flowi4_oif))
+ fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
+
if (iif) {
struct net_device *dev;
@@ -2483,7 +2595,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
- err = rt_fill_info(net, dst, src, &fl4, skb,
+ if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
+ table_id = rt->rt_table_id;
+
+ err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
RTM_NEWROUTE, 0, 0);
if (err < 0)
@@ -2504,7 +2619,6 @@ void ip_rt_multicast_event(struct in_device *in_dev)
}
#ifdef CONFIG_SYSCTL
-static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
@@ -2742,6 +2856,10 @@ int __init ip_rt_init(void)
prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+ ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
+ if (!ip_tstamps)
+ panic("IP: failed to allocate ip_tstamps\n");
+
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
diff --git a/kernel/net/ipv4/syncookies.c b/kernel/net/ipv4/syncookies.c
index df849e5a1..4cbe9f0a4 100644
--- a/kernel/net/ipv4/syncookies.c
+++ b/kernel/net/ipv4/syncookies.c
@@ -192,15 +192,11 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
-__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
- __u16 *mssp)
+__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
{
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
- tcp_synq_overflow(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-
return __cookie_v4_init_sequence(iph, th, mssp);
}
@@ -219,23 +215,26 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_check);
-static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst)
+struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
+ bool own_req;
- child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+ NULL, &own_req);
if (child) {
atomic_set(&req->rsk_refcnt, 1);
+ sock_rps_save_rxhash(child, skb);
inet_csk_reqsk_queue_add(sk, req, child);
} else {
reqsk_free(req);
}
return child;
}
-
+EXPORT_SYMBOL(tcp_get_cookie_sock);
/*
* when syncookies are in effect and tcp timestamps are enabled we stored
@@ -288,6 +287,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
}
EXPORT_SYMBOL(cookie_ecn_ok);
+/* On input, sk is a listener.
+ * Output is listener if incoming packet would not create a child
+ * NULL if memory could not be allocated.
+ */
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
@@ -326,7 +329,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
- req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
+ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
if (!req)
goto out;
@@ -345,7 +348,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ireq->wscale_ok = tcp_opt.wscale_ok;
ireq->tstamp_ok = tcp_opt.saw_tstamp;
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
- treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+ treq->snt_synack.v64 = 0;
treq->tfo_listener = false;
ireq->ir_iif = sk->sk_bound_dev_if;
@@ -381,17 +384,17 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
}
/* Try to redo what tcp_v4_send_synack did. */
- req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
- &req->rcv_wnd, &req->window_clamp,
+ &req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
- ret = get_cookie_sock(sk, skb, req, &rt->dst);
+ ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
/* ip_queue_xmit() depends on our flow being setup
* Normal sockets get it right from inet_csk_route_child_sock()
*/
diff --git a/kernel/net/ipv4/sysctl_net_ipv4.c b/kernel/net/ipv4/sysctl_net_ipv4.c
index 143f5f380..1866f9102 100644
--- a/kernel/net/ipv4/sysctl_net_ipv4.c
+++ b/kernel/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
static int zero;
static int one = 1;
static int four = 4;
+static int thousand = 1000;
static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
@@ -45,10 +46,16 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
/* Update system visible IP port range */
static void set_local_port_range(struct net *net, int range[2])
{
- write_seqlock(&net->ipv4.ip_local_ports.lock);
+ bool same_parity = !((range[0] ^ range[1]) & 1);
+
+ write_seqlock_bh(&net->ipv4.ip_local_ports.lock);
+ if (same_parity && !net->ipv4.ip_local_ports.warned) {
+ net->ipv4.ip_local_ports.warned = true;
+ pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
+ }
net->ipv4.ip_local_ports.range[0] = range[0];
net->ipv4.ip_local_ports.range[1] = range[1];
- write_sequnlock(&net->ipv4.ip_local_ports.lock);
+ write_sequnlock_bh(&net->ipv4.ip_local_ports.lock);
}
/* Validate changes from /proc interface. */
@@ -489,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_recovery",
+ .data = &sysctl_tcp_recovery,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "tcp_reordering",
.data = &sysctl_tcp_reordering,
.maxlen = sizeof(int),
@@ -570,6 +584,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_min_rtt_wlen",
+ .data = &sysctl_tcp_min_rtt_wlen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
@@ -702,10 +723,28 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = &one,
.extra2 = &gso_max_segs,
},
{
+ .procname = "tcp_pacing_ss_ratio",
+ .data = &sysctl_tcp_pacing_ss_ratio,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &thousand,
+ },
+ {
+ .procname = "tcp_pacing_ca_ratio",
+ .data = &sysctl_tcp_pacing_ca_ratio,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &thousand,
+ },
+ {
.procname = "tcp_autocorking",
.data = &sysctl_tcp_autocorking,
.maxlen = sizeof(int),
@@ -828,6 +867,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_ecn_fallback",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "ip_local_port_range",
.maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
.data = &init_net.ipv4.ip_local_ports.range,
@@ -904,6 +950,13 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "igmp_link_local_mcast_reports",
+ .data = &sysctl_igmp_llm_reports,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{ }
};
diff --git a/kernel/net/ipv4/tcp.c b/kernel/net/ipv4/tcp.c
index bb2ce74f6..036a76ba2 100644
--- a/kernel/net/ipv4/tcp.c
+++ b/kernel/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <asm/unaligned.h>
#include <net/busy_poll.h>
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
@@ -388,6 +389,7 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ tp->rtt_min[0].rtt = ~0U;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -450,11 +452,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
unsigned int mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
+ int state;
sock_rps_record_flow(sk);
sock_poll_wait(file, sk_sleep(sk), wait);
- if (sk->sk_state == TCP_LISTEN)
+
+ state = sk_state_load(sk);
+ if (state == TCP_LISTEN)
return inet_csk_listen_poll(sk);
/* Socket is not locked. We are protected from async events
@@ -491,14 +496,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
* blocking on fresh not-connected or disconnected socket. --ANK
*/
- if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
+ if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
mask |= POLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLIN | POLLRDNORM | POLLRDHUP;
/* Connected or passive Fast Open socket? */
- if (sk->sk_state != TCP_SYN_SENT &&
- (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ if (state != TCP_SYN_SENT &&
+ (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
if (tp->urg_seq == tp->copied_seq &&
@@ -506,9 +511,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
tp->urg_data)
target++;
- /* Potential race condition. If read of tp below will
- * escape above sk->sk_state, we can be illegally awaken
- * in SYN_* states. */
if (tp->rcv_nxt - tp->copied_seq >= target)
mask |= POLLIN | POLLRDNORM;
@@ -516,8 +518,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
- set_bit(SOCK_ASYNC_NOSPACE,
- &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
/* Race breaker. If space is freed after
@@ -627,6 +628,8 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb)
sk_mem_charge(sk, skb->truesize);
if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
+
+ tcp_slow_start_after_idle_check(sk);
}
static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
@@ -695,8 +698,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
struct tcp_splice_state *tss = rd_desc->arg.data;
int ret;
- ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
- tss->flags);
+ ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
+ min(rd_desc->count, len), tss->flags,
+ skb_socket_splice);
if (ret > 0)
rd_desc->count -= ret;
return ret;
@@ -779,7 +783,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
ret = -EAGAIN;
break;
}
- sk_wait_data(sk, &timeo);
+ sk_wait_data(sk, &timeo, NULL);
if (signal_pending(current)) {
ret = sock_intr_errno(timeo);
break;
@@ -809,16 +813,28 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
EXPORT_SYMBOL(tcp_splice_read);
-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+ bool force_schedule)
{
struct sk_buff *skb;
/* The TCP header must be at least 32-bit aligned. */
size = ALIGN(size, 4);
+ if (unlikely(tcp_under_memory_pressure(sk)))
+ sk_mem_reclaim_partial(sk);
+
skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
- if (skb) {
- if (sk_wmem_schedule(sk, skb->truesize)) {
+ if (likely(skb)) {
+ bool mem_scheduled;
+
+ if (force_schedule) {
+ mem_scheduled = true;
+ sk_forced_mem_schedule(sk, skb->truesize);
+ } else {
+ mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
+ }
+ if (likely(mem_scheduled)) {
skb_reserve(skb, sk->sk_prot->max_header);
/*
* Make sure that we have exactly size bytes
@@ -885,11 +901,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
goto out_err;
}
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
mss_now = tcp_send_mss(sk, &size_goal, flags);
copied = 0;
@@ -908,7 +925,8 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
+ skb_queue_empty(&sk->sk_write_queue));
if (!skb)
goto wait_for_memory;
@@ -921,7 +939,7 @@ new_segment:
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+ if (!can_coalesce && i >= sysctl_max_skb_frags) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -951,7 +969,8 @@ new_segment:
copied += copy;
offset += copy;
- if (!(size -= copy)) {
+ size -= copy;
+ if (!size) {
tcp_tx_timestamp(sk, skb);
goto out;
}
@@ -972,7 +991,8 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -987,6 +1007,9 @@ do_error:
if (copied)
goto out;
out_err:
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ sk->sk_write_space(sk);
return sk_stream_error(sk, flags, err);
}
@@ -1092,7 +1115,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
goto do_error;
}
@@ -1110,7 +1134,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
}
/* This should be in poll */
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1144,7 +1168,8 @@ new_segment:
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
- sk->sk_allocation);
+ sk->sk_allocation,
+ skb_queue_empty(&sk->sk_write_queue));
if (!skb)
goto wait_for_memory;
@@ -1187,7 +1212,7 @@ new_segment:
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
- if (i == MAX_SKB_FRAGS || !sg) {
+ if (i == sysctl_max_skb_frags || !sg) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1247,7 +1272,8 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1275,6 +1301,9 @@ do_error:
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ sk->sk_write_space(sk);
release_sock(sk);
return err;
}
@@ -1554,7 +1583,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
- struct sk_buff *skb;
+ struct sk_buff *skb, *last;
u32 urg_hole = 0;
if (unlikely(flags & MSG_ERRQUEUE))
@@ -1614,7 +1643,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
/* Next get a buffer. */
+ last = skb_peek_tail(&sk->sk_receive_queue);
skb_queue_walk(&sk->sk_receive_queue, skb) {
+ last = skb;
/* Now that we have two receive queues this
* shouldn't happen.
*/
@@ -1733,15 +1764,17 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
/* Do not sleep, just process backlog. */
release_sock(sk);
lock_sock(sk);
- } else
- sk_wait_data(sk, &timeo);
+ } else {
+ sk_wait_data(sk, &timeo, last);
+ }
if (user_recv) {
int chunk;
/* __ Restore normal policy in scheduler __ */
- if ((chunk = len - tp->ucopy.len) != 0) {
+ chunk = len - tp->ucopy.len;
+ if (chunk != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
len -= chunk;
copied += chunk;
@@ -1752,7 +1785,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
do_prequeue:
tcp_prequeue_process(sk);
- if ((chunk = len - tp->ucopy.len) != 0) {
+ chunk = len - tp->ucopy.len;
+ if (chunk != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
@@ -1900,7 +1934,7 @@ void tcp_set_state(struct sock *sk, int state)
/* Change state AFTER socket is unhashed to avoid closed
* socket sitting in hash tables.
*/
- sk->sk_state = state;
+ sk_state_store(sk, state);
#ifdef STATE_TRACE
SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
@@ -2204,7 +2238,8 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
- if ((tp->write_seq += tp->max_window + 2) == 0)
+ tp->write_seq += tp->max_window + 2;
+ if (tp->write_seq == 0)
tp->write_seq = 1;
icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
@@ -2227,13 +2262,6 @@ int tcp_disconnect(struct sock *sk, int flags)
}
EXPORT_SYMBOL(tcp_disconnect);
-void tcp_sock_destruct(struct sock *sk)
-{
- inet_sock_destruct(sk);
-
- kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
-}
-
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
@@ -2483,6 +2511,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
icsk->icsk_syn_retries = val;
break;
+ case TCP_SAVE_SYN:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else
+ tp->save_syn = val;
+ break;
+
case TCP_LINGER2:
if (val < 0)
tp->linger2 = -1;
@@ -2548,7 +2583,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
TCPF_LISTEN))) {
tcp_fastopen_init_key_once(true);
- err = fastopen_init_queue(sk, val);
+ fastopen_queue_tune(sk, val);
} else {
err = -EINVAL;
}
@@ -2599,15 +2634,19 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
- const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
unsigned int start;
+ u64 rate64;
u32 rate;
memset(info, 0, sizeof(*info));
+ if (sk->sk_type != SOCK_STREAM)
+ return;
+
+ info->tcpi_state = sk_state_load(sk);
- info->tcpi_state = sk->sk_state;
info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
@@ -2635,7 +2674,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
- if (sk->sk_state == TCP_LISTEN) {
+ if (info->tcpi_state == TCP_LISTEN) {
info->tcpi_unacked = sk->sk_ack_backlog;
info->tcpi_sacked = sk->sk_max_ack_backlog;
} else {
@@ -2665,16 +2704,20 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_total_retrans = tp->total_retrans;
rate = READ_ONCE(sk->sk_pacing_rate);
- info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ put_unaligned(rate64, &info->tcpi_pacing_rate);
rate = READ_ONCE(sk->sk_max_pacing_rate);
- info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ put_unaligned(rate64, &info->tcpi_max_pacing_rate);
do {
start = u64_stats_fetch_begin_irq(&tp->syncp);
- info->tcpi_bytes_acked = tp->bytes_acked;
- info->tcpi_bytes_received = tp->bytes_received;
+ put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
+ put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
+ info->tcpi_segs_out = tp->segs_out;
+ info->tcpi_segs_in = tp->segs_in;
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2812,10 +2855,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_FASTOPEN:
- if (icsk->icsk_accept_queue.fastopenq)
- val = icsk->icsk_accept_queue.fastopenq->max_qlen;
- else
- val = 0;
+ val = icsk->icsk_accept_queue.fastopenq.max_qlen;
break;
case TCP_TIMESTAMP:
@@ -2824,6 +2864,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_NOTSENT_LOWAT:
val = tp->notsent_lowat;
break;
+ case TCP_SAVE_SYN:
+ val = tp->save_syn;
+ break;
+ case TCP_SAVED_SYN: {
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+ if (tp->saved_syn) {
+ if (len < tp->saved_syn[0]) {
+ if (put_user(tp->saved_syn[0], optlen)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ release_sock(sk);
+ return -EINVAL;
+ }
+ len = tp->saved_syn[0];
+ if (put_user(len, optlen)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ tcp_saved_syn_free(tp);
+ release_sock(sk);
+ } else {
+ release_sock(sk);
+ len = 0;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ }
+ return 0;
+ }
default:
return -ENOPROTOOPT;
}
@@ -3028,11 +3104,12 @@ __setup("thash_entries=", set_thash_entries);
static void __init tcp_init_mem(void)
{
- unsigned long limit = nr_free_buffer_pages() / 8;
+ unsigned long limit = nr_free_buffer_pages() / 16;
+
limit = max(limit, 128UL);
- sysctl_tcp_mem[0] = limit / 4 * 3;
- sysctl_tcp_mem[1] = limit;
- sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+ sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */
+ sysctl_tcp_mem[1] = limit; /* 6.25 % */
+ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
}
void __init tcp_init(void)
diff --git a/kernel/net/ipv4/tcp_bic.c b/kernel/net/ipv4/tcp_bic.c
index c037644ea..fd1405d37 100644
--- a/kernel/net/ipv4/tcp_bic.c
+++ b/kernel/net/ipv4/tcp_bic.c
@@ -146,7 +146,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
bictcp_update(ca, tp->snd_cwnd);
diff --git a/kernel/net/ipv4/tcp_cdg.c b/kernel/net/ipv4/tcp_cdg.c
new file mode 100644
index 000000000..167b6a3e1
--- /dev/null
+++ b/kernel/net/ipv4/tcp_cdg.c
@@ -0,0 +1,433 @@
+/*
+ * CAIA Delay-Gradient (CDG) congestion control
+ *
+ * This implementation is based on the paper:
+ * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011.
+ *
+ * Scavenger traffic (Less-than-Best-Effort) should disable coexistence
+ * heuristics using parameters use_shadow=0 and use_ineff=0.
+ *
+ * Parameters window, backoff_beta, and backoff_factor are crucial for
+ * throughput and delay. Future work is needed to determine better defaults,
+ * and to provide guidelines for use in different environments/contexts.
+ *
+ * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/.
+ * Parameter window is only configurable when loading tcp_cdg as a module.
+ *
+ * Notable differences from paper/FreeBSD:
+ * o Using Hybrid Slow start and Proportional Rate Reduction.
+ * o Add toggle for shadow window mechanism. Suggested by David Hayes.
+ * o Add toggle for non-congestion loss tolerance.
+ * o Scaling parameter G is changed to a backoff factor;
+ * conversion is given by: backoff_factor = 1000/(G * window).
+ * o Limit shadow window to 2 * cwnd, or to cwnd when application limited.
+ * o More accurate e^-x.
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define HYSTART_ACK_TRAIN 1
+#define HYSTART_DELAY 2
+
+static int window __read_mostly = 8;
+static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */
+static unsigned int backoff_factor __read_mostly = 42;
+static unsigned int hystart_detect __read_mostly = 3;
+static unsigned int use_ineff __read_mostly = 5;
+static bool use_shadow __read_mostly = true;
+static bool use_tolerance __read_mostly;
+
+module_param(window, int, 0444);
+MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)");
+module_param(backoff_beta, uint, 0644);
+MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)");
+module_param(backoff_factor, uint, 0644);
+MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor");
+module_param(hystart_detect, uint, 0644);
+MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start "
+ "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)");
+module_param(use_ineff, uint, 0644);
+MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)");
+module_param(use_shadow, bool, 0644);
+MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
+module_param(use_tolerance, bool, 0644);
+MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
+
+struct minmax {
+ union {
+ struct {
+ s32 min;
+ s32 max;
+ };
+ u64 v64;
+ };
+};
+
+enum cdg_state {
+ CDG_UNKNOWN = 0,
+ CDG_NONFULL = 1,
+ CDG_FULL = 2,
+ CDG_BACKOFF = 3,
+};
+
+struct cdg {
+ struct minmax rtt;
+ struct minmax rtt_prev;
+ struct minmax *gradients;
+ struct minmax gsum;
+ bool gfilled;
+ u8 tail;
+ u8 state;
+ u8 delack;
+ u32 rtt_seq;
+ u32 undo_cwnd;
+ u32 shadow_wnd;
+ u16 backoff_cnt;
+ u16 sample_cnt;
+ s32 delay_min;
+ u32 last_ack;
+ u32 round_start;
+};
+
+/**
+ * nexp_u32 - negative base-e exponential
+ * @ux: x in units of micro
+ *
+ * Returns exp(ux * -1e-6) * U32_MAX.
+ */
+static u32 __pure nexp_u32(u32 ux)
+{
+ static const u16 v[] = {
+ /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */
+ 65535,
+ 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422,
+ 61378, 57484, 50423, 38795, 22965, 8047, 987, 14,
+ };
+ u32 msb = ux >> 8;
+ u32 res;
+ int i;
+
+ /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */
+ if (msb > U16_MAX)
+ return 0;
+
+ /* Scale first eight bits linearly: */
+ res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000);
+
+ /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */
+ for (i = 1; msb; i++, msb >>= 1) {
+ u32 y = v[i & -(msb & 1)] + U32_C(1);
+
+ res = ((u64)res * y) >> 16;
+ }
+
+ return res;
+}
+
+/* Based on the HyStart algorithm (by Ha et al.) that is implemented in
+ * tcp_cubic. Differences/experimental changes:
+ * o Using Hayes' delayed ACK filter.
+ * o Using a usec clock for the ACK train.
+ * o Reset ACK train when application limited.
+ * o Invoked at any cwnd (i.e. also when cwnd < 16).
+ * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh).
+ */
+static void tcp_cdg_hystart_update(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min);
+ if (ca->delay_min == 0)
+ return;
+
+ if (hystart_detect & HYSTART_ACK_TRAIN) {
+ u32 now_us = div_u64(local_clock(), NSEC_PER_USEC);
+
+ if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
+ ca->last_ack = now_us;
+ ca->round_start = now_us;
+ } else if (before(now_us, ca->last_ack + 3000)) {
+ u32 base_owd = max(ca->delay_min / 2U, 125U);
+
+ ca->last_ack = now_us;
+ if (after(now_us, ca->round_start + base_owd)) {
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ return;
+ }
+ }
+ }
+
+ if (hystart_detect & HYSTART_DELAY) {
+ if (ca->sample_cnt < 8) {
+ ca->sample_cnt++;
+ } else {
+ s32 thresh = max(ca->delay_min + ca->delay_min / 8U,
+ 125U);
+
+ if (ca->rtt.min > thresh) {
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+ }
+ }
+}
+
+static s32 tcp_cdg_grad(struct cdg *ca)
+{
+ s32 gmin = ca->rtt.min - ca->rtt_prev.min;
+ s32 gmax = ca->rtt.max - ca->rtt_prev.max;
+ s32 grad;
+
+ if (ca->gradients) {
+ ca->gsum.min += gmin - ca->gradients[ca->tail].min;
+ ca->gsum.max += gmax - ca->gradients[ca->tail].max;
+ ca->gradients[ca->tail].min = gmin;
+ ca->gradients[ca->tail].max = gmax;
+ ca->tail = (ca->tail + 1) & (window - 1);
+ gmin = ca->gsum.min;
+ gmax = ca->gsum.max;
+ }
+
+ /* We keep sums to ignore gradients during cwnd reductions;
+ * the paper's smoothed gradients otherwise simplify to:
+ * (rtt_latest - rtt_oldest) / window.
+ *
+ * We also drop division by window here.
+ */
+ grad = gmin > 0 ? gmin : gmax;
+
+ /* Extrapolate missing values in gradient window: */
+ if (!ca->gfilled) {
+ if (!ca->gradients && window > 1)
+ grad *= window; /* Memory allocation failed. */
+ else if (ca->tail == 0)
+ ca->gfilled = true;
+ else
+ grad = (grad * window) / (int)ca->tail;
+ }
+
+ /* Backoff was effectual: */
+ if (gmin <= -32 || gmax <= -32)
+ ca->backoff_cnt = 0;
+
+ if (use_tolerance) {
+ /* Reduce small variations to zero: */
+ gmin = DIV_ROUND_CLOSEST(gmin, 64);
+ gmax = DIV_ROUND_CLOSEST(gmax, 64);
+
+ if (gmin > 0 && gmax <= 0)
+ ca->state = CDG_FULL;
+ else if ((gmin > 0 && gmax > 0) || gmax < 0)
+ ca->state = CDG_NONFULL;
+ }
+ return grad;
+}
+
+static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (prandom_u32() <= nexp_u32(grad * backoff_factor))
+ return false;
+
+ if (use_ineff) {
+ ca->backoff_cnt++;
+ if (ca->backoff_cnt > use_ineff)
+ return false;
+ }
+
+ ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd);
+ ca->state = CDG_BACKOFF;
+ tcp_enter_cwr(sk);
+ return true;
+}
+
+/* Not called in CWR or Recovery state. */
+static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_snd_cwnd;
+ u32 incr;
+
+ if (tcp_in_slow_start(tp) && hystart_detect)
+ tcp_cdg_hystart_update(sk);
+
+ if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
+ s32 grad = 0;
+
+ if (ca->rtt_prev.v64)
+ grad = tcp_cdg_grad(ca);
+ ca->rtt_seq = tp->snd_nxt;
+ ca->rtt_prev = ca->rtt;
+ ca->rtt.v64 = 0;
+ ca->last_ack = 0;
+ ca->sample_cnt = 0;
+
+ if (grad > 0 && tcp_cdg_backoff(sk, grad))
+ return;
+ }
+
+ if (!tcp_is_cwnd_limited(sk)) {
+ ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd);
+ return;
+ }
+
+ prior_snd_cwnd = tp->snd_cwnd;
+ tcp_reno_cong_avoid(sk, ack, acked);
+
+ incr = tp->snd_cwnd - prior_snd_cwnd;
+ ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
+}
+
+static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (rtt_us <= 0)
+ return;
+
+ /* A heuristic for filtering delayed ACKs, adapted from:
+ * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support
+ * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
+ */
+ if (tp->sacked_out == 0) {
+ if (num_acked == 1 && ca->delack) {
+ /* A delayed ACK is only used for the minimum if it is
+ * provenly lower than an existing non-zero minimum.
+ */
+ ca->rtt.min = min(ca->rtt.min, rtt_us);
+ ca->delack--;
+ return;
+ } else if (num_acked > 1 && ca->delack < 5) {
+ ca->delack++;
+ }
+ }
+
+ ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us);
+ ca->rtt.max = max(ca->rtt.max, rtt_us);
+}
+
+static u32 tcp_cdg_ssthresh(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->undo_cwnd = tp->snd_cwnd;
+
+ if (ca->state == CDG_BACKOFF)
+ return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10);
+
+ if (ca->state == CDG_NONFULL && use_tolerance)
+ return tp->snd_cwnd;
+
+ ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd);
+ if (use_shadow)
+ return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1);
+ return max(2U, tp->snd_cwnd >> 1);
+}
+
+static u32 tcp_cdg_undo_cwnd(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd);
+}
+
+static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct minmax *gradients;
+
+ switch (ev) {
+ case CA_EVENT_CWND_RESTART:
+ gradients = ca->gradients;
+ if (gradients)
+ memset(gradients, 0, window * sizeof(gradients[0]));
+ memset(ca, 0, sizeof(*ca));
+
+ ca->gradients = gradients;
+ ca->rtt_seq = tp->snd_nxt;
+ ca->shadow_wnd = tp->snd_cwnd;
+ break;
+ case CA_EVENT_COMPLETE_CWR:
+ ca->state = CDG_UNKNOWN;
+ ca->rtt_seq = tp->snd_nxt;
+ ca->rtt_prev = ca->rtt;
+ ca->rtt.v64 = 0;
+ break;
+ default:
+ break;
+ }
+}
+
+static void tcp_cdg_init(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* We silently fall back to window = 1 if allocation fails. */
+ if (window > 1)
+ ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
+ GFP_NOWAIT | __GFP_NOWARN);
+ ca->rtt_seq = tp->snd_nxt;
+ ca->shadow_wnd = tp->snd_cwnd;
+}
+
+static void tcp_cdg_release(struct sock *sk)
+{
+ struct cdg *ca = inet_csk_ca(sk);
+
+ kfree(ca->gradients);
+}
+
+struct tcp_congestion_ops tcp_cdg __read_mostly = {
+ .cong_avoid = tcp_cdg_cong_avoid,
+ .cwnd_event = tcp_cdg_cwnd_event,
+ .pkts_acked = tcp_cdg_acked,
+ .undo_cwnd = tcp_cdg_undo_cwnd,
+ .ssthresh = tcp_cdg_ssthresh,
+ .release = tcp_cdg_release,
+ .init = tcp_cdg_init,
+ .owner = THIS_MODULE,
+ .name = "cdg",
+};
+
+static int __init tcp_cdg_register(void)
+{
+ if (backoff_beta > 1024 || window < 1 || window > 256)
+ return -ERANGE;
+ if (!is_power_of_2(window))
+ return -EINVAL;
+
+ BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_cdg);
+ return 0;
+}
+
+static void __exit tcp_cdg_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_cdg);
+}
+
+module_init(tcp_cdg_register);
+module_exit(tcp_cdg_unregister);
+MODULE_AUTHOR("Kenneth Klette Jonassen");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP CDG");
diff --git a/kernel/net/ipv4/tcp_cong.c b/kernel/net/ipv4/tcp_cong.c
index 84be008c9..882caa4e7 100644
--- a/kernel/net/ipv4/tcp_cong.c
+++ b/kernel/net/ipv4/tcp_cong.c
@@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
-u32 tcp_ca_get_key_by_name(const char *name)
+u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
- u32 key;
+ u32 key = TCP_CA_UNSPEC;
might_sleep();
rcu_read_lock();
ca = __tcp_ca_find_autoload(name);
- key = ca ? ca->key : TCP_CA_UNSPEC;
+ if (ca) {
+ key = ca->key;
+ *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
+ }
rcu_read_unlock();
return key;
@@ -170,6 +173,10 @@ out:
*/
if (ca->get_info)
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ if (ca->flags & TCP_CONG_NEEDS_ECN)
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
}
void tcp_init_congestion_control(struct sock *sk)
@@ -178,6 +185,10 @@ void tcp_init_congestion_control(struct sock *sk)
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
}
static void tcp_reinit_congestion_control(struct sock *sk,
@@ -189,8 +200,8 @@ static void tcp_reinit_congestion_control(struct sock *sk,
icsk->icsk_ca_ops = ca;
icsk->icsk_ca_setsockopt = 1;
- if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
- icsk->icsk_ca_ops->init(sk);
+ if (sk->sk_state != TCP_CLOSE)
+ tcp_init_congestion_control(sk);
}
/* Manage refcounts on socket close. */
@@ -365,10 +376,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
*/
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- u32 cwnd = tp->snd_cwnd + acked;
+ u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
- if (cwnd > tp->snd_ssthresh)
- cwnd = tp->snd_ssthresh + 1;
acked -= cwnd - tp->snd_cwnd;
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
@@ -413,7 +422,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
/* In "safe" area, increase. */
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
diff --git a/kernel/net/ipv4/tcp_cubic.c b/kernel/net/ipv4/tcp_cubic.c
index 06d3d665a..448c2615f 100644
--- a/kernel/net/ipv4/tcp_cubic.c
+++ b/kernel/net/ipv4/tcp_cubic.c
@@ -151,6 +151,27 @@ static void bictcp_init(struct sock *sk)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
+static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_TX_START) {
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 now = tcp_time_stamp;
+ s32 delta;
+
+ delta = now - tcp_sk(sk)->lsndtime;
+
+ /* We were application limited (idle) for a while.
+ * Shift epoch_start to keep cwnd growth to cubic curve.
+ */
+ if (ca->epoch_start && delta > 0) {
+ ca->epoch_start += delta;
+ if (after(ca->epoch_start, now))
+ ca->epoch_start = now;
+ }
+ return;
+ }
+}
+
/* calculate the cubic root of x using a table lookup followed by one
* Newton-Raphson iteration.
* Avg err ~= 0.195%
@@ -320,7 +341,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
acked = tcp_slow_start(tp, acked);
@@ -439,7 +460,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
ca->delay_min = delay;
/* hystart triggers when cwnd is larger than some threshold */
- if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
+ if (hystart && tcp_in_slow_start(tp) &&
tp->snd_cwnd >= hystart_low_window)
hystart_update(sk, delay);
}
@@ -450,6 +471,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
.cong_avoid = bictcp_cong_avoid,
.set_state = bictcp_state,
.undo_cwnd = bictcp_undo_cwnd,
+ .cwnd_event = bictcp_cwnd_event,
.pkts_acked = bictcp_acked,
.owner = THIS_MODULE,
.name = "cubic",
diff --git a/kernel/net/ipv4/tcp_dctcp.c b/kernel/net/ipv4/tcp_dctcp.c
index 4c41c1287..7e538f71f 100644
--- a/kernel/net/ipv4/tcp_dctcp.c
+++ b/kernel/net/ipv4/tcp_dctcp.c
@@ -204,20 +204,26 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
/* Expired RTT */
if (!before(tp->snd_una, ca->next_seq)) {
- /* For avoiding denominator == 1. */
- if (ca->acked_bytes_total == 0)
- ca->acked_bytes_total = 1;
+ u64 bytes_ecn = ca->acked_bytes_ecn;
+ u32 alpha = ca->dctcp_alpha;
/* alpha = (1 - g) * alpha + g * F */
- ca->dctcp_alpha = ca->dctcp_alpha -
- (ca->dctcp_alpha >> dctcp_shift_g) +
- (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
- ca->acked_bytes_total;
- if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
- /* Clamp dctcp_alpha to max. */
- ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
+ if (bytes_ecn) {
+ /* If dctcp_shift_g == 1, a 32bit value would overflow
+ * after 8 Mbytes.
+ */
+ bytes_ecn <<= (10 - dctcp_shift_g);
+ do_div(bytes_ecn, max(1U, ca->acked_bytes_total));
+ alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA);
+ }
+ /* dctcp_alpha can be read from dctcp_get_info() without
+ * synchro, so we ask compiler to not use dctcp_alpha
+ * as a temporary variable in prior operations.
+ */
+ WRITE_ONCE(ca->dctcp_alpha, alpha);
dctcp_reset(tp, ca);
}
}
diff --git a/kernel/net/ipv4/tcp_diag.c b/kernel/net/ipv4/tcp_diag.c
index 79b34a0f4..b31604086 100644
--- a/kernel/net/ipv4/tcp_diag.c
+++ b/kernel/net/ipv4/tcp_diag.c
@@ -19,13 +19,14 @@
static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
void *_info)
{
- const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_info *info = _info;
- if (sk->sk_state == TCP_LISTEN) {
+ if (sk_state_load(sk) == TCP_LISTEN) {
r->idiag_rqueue = sk->sk_ack_backlog;
r->idiag_wqueue = sk->sk_max_ack_backlog;
- } else {
+ } else if (sk->sk_type == SOCK_STREAM) {
+ const struct tcp_sock *tp = tcp_sk(sk);
+
r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
r->idiag_wqueue = tp->write_seq - tp->snd_una;
}
@@ -50,6 +51,7 @@ static const struct inet_diag_handler tcp_diag_handler = {
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
.idiag_type = IPPROTO_TCP,
+ .idiag_info_size = sizeof(struct tcp_info),
};
static int __init tcp_diag_init(void)
diff --git a/kernel/net/ipv4/tcp_fastopen.c b/kernel/net/ipv4/tcp_fastopen.c
index f9c0fb84e..55be6ac70 100644
--- a/kernel/net/ipv4/tcp_fastopen.c
+++ b/kernel/net/ipv4/tcp_fastopen.c
@@ -124,27 +124,29 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
return false;
}
-static bool tcp_fastopen_create_child(struct sock *sk,
- struct sk_buff *skb,
- struct dst_entry *dst,
- struct request_sock *req)
+static struct sock *tcp_fastopen_create_child(struct sock *sk,
+ struct sk_buff *skb,
+ struct dst_entry *dst,
+ struct request_sock *req)
{
struct tcp_sock *tp;
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
struct sock *child;
u32 end_seq;
+ bool own_req;
req->num_retrans = 0;
req->num_timeout = 0;
req->sk = NULL;
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ NULL, &own_req);
if (!child)
- return false;
+ return NULL;
- spin_lock(&queue->fastopenq->lock);
- queue->fastopenq->qlen++;
- spin_unlock(&queue->fastopenq->lock);
+ spin_lock(&queue->fastopenq.lock);
+ queue->fastopenq.qlen++;
+ spin_unlock(&queue->fastopenq.lock);
/* Initialize the child socket. Have to fix some values to take
* into account the child is a Fast Open socket and is created
@@ -161,15 +163,13 @@ static bool tcp_fastopen_create_child(struct sock *sk,
tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
/* Activate the retrans timer so that SYNACK can be retransmitted.
- * The request socket is not added to the SYN table of the parent
+ * The request socket is not added to the ehash
* because it's been added to the accept queue directly.
*/
inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT, TCP_RTO_MAX);
- atomic_set(&req->rsk_refcnt, 1);
- /* Add the child socket directly into the accept queue */
- inet_csk_reqsk_queue_add(sk, req, child);
+ atomic_set(&req->rsk_refcnt, 2);
/* Now finish processing the fastopen child socket. */
inet_csk(child)->icsk_af_ops->rebuild_header(child);
@@ -178,12 +178,10 @@ static bool tcp_fastopen_create_child(struct sock *sk,
tcp_init_metrics(child);
tcp_init_buffer_space(child);
- /* Queue the data carried in the SYN packet. We need to first
- * bump skb's refcnt because the caller will attempt to free it.
- * Note that IPv6 might also have used skb_get() trick
- * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts)
- * So we need to eventually get a clone of the packet,
- * before inserting it in sk_receive_queue.
+ /* Queue the data carried in the SYN packet.
+ * We used to play tricky games with skb_get().
+ * With lockless listener, it is a dead end.
+ * Do not think about it.
*
* XXX (TFO) - we honor a zero-payload TFO request for now,
* (any reason not to?) but no need to queue the skb since
@@ -191,12 +189,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
*/
end_seq = TCP_SKB_CB(skb)->end_seq;
if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
- struct sk_buff *skb2;
-
- if (unlikely(skb_shared(skb)))
- skb2 = skb_clone(skb, GFP_ATOMIC);
- else
- skb2 = skb_get(skb);
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (likely(skb2)) {
skb_dst_drop(skb2);
@@ -214,11 +207,10 @@ static bool tcp_fastopen_create_child(struct sock *sk,
}
}
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
- sk->sk_data_ready(sk);
- bh_unlock_sock(child);
- sock_put(child);
- WARN_ON(!req->sk);
- return true;
+ /* tcp_conn_request() is sending the SYNACK,
+ * and queues the child into listener accept queue.
+ */
+ return child;
}
static bool tcp_fastopen_queue_check(struct sock *sk)
@@ -235,8 +227,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* between qlen overflow causing Fast Open to be disabled
* temporarily vs a server not supporting Fast Open at all.
*/
- fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
- if (!fastopenq || fastopenq->max_qlen == 0)
+ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+ if (fastopenq->max_qlen == 0)
return false;
if (fastopenq->qlen >= fastopenq->max_qlen) {
@@ -261,13 +253,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* may be updated and return the client in the SYN-ACK later. E.g., Fast Open
* cookie request (foc->len == 0).
*/
-bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst)
+struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ struct dst_entry *dst)
{
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+ struct sock *child;
if (foc->len == 0) /* Client requests a cookie */
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
@@ -276,7 +269,7 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
(syn_data || foc->len >= 0) &&
tcp_fastopen_queue_check(sk))) {
foc->len = -1;
- return false;
+ return NULL;
}
if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
@@ -296,11 +289,12 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
* data in SYN_RECV state.
*/
fastopen:
- if (tcp_fastopen_create_child(sk, skb, dst, req)) {
+ child = tcp_fastopen_create_child(sk, skb, dst, req);
+ if (child) {
foc->len = -1;
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVE);
- return true;
+ return child;
}
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
} else if (foc->len > 0) /* Client presents an invalid cookie */
@@ -308,6 +302,5 @@ fastopen:
valid_foc.exp = foc->exp;
*foc = valid_foc;
- return false;
+ return NULL;
}
-EXPORT_SYMBOL(tcp_try_fastopen);
diff --git a/kernel/net/ipv4/tcp_highspeed.c b/kernel/net/ipv4/tcp_highspeed.c
index 882c08aae..db7842495 100644
--- a/kernel/net/ipv4/tcp_highspeed.c
+++ b/kernel/net/ipv4/tcp_highspeed.c
@@ -116,7 +116,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
/* Update AIMD parameters.
diff --git a/kernel/net/ipv4/tcp_htcp.c b/kernel/net/ipv4/tcp_htcp.c
index 58469fff6..82f0d9ed6 100644
--- a/kernel/net/ipv4/tcp_htcp.c
+++ b/kernel/net/ipv4/tcp_htcp.c
@@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
/* In dangerous area, increase slowly.
diff --git a/kernel/net/ipv4/tcp_hybla.c b/kernel/net/ipv4/tcp_hybla.c
index f963b274f..083831e35 100644
--- a/kernel/net/ipv4/tcp_hybla.c
+++ b/kernel/net/ipv4/tcp_hybla.c
@@ -112,7 +112,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
rho_fractions = ca->rho_3ls - (ca->rho << 3);
- if (tp->snd_cwnd < tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
/*
* slow start
* INC = 2^RHO - 1
diff --git a/kernel/net/ipv4/tcp_illinois.c b/kernel/net/ipv4/tcp_illinois.c
index f71002e4d..2ab9bbb6f 100644
--- a/kernel/net/ipv4/tcp_illinois.c
+++ b/kernel/net/ipv4/tcp_illinois.c
@@ -268,7 +268,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
/* In slow start */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else {
diff --git a/kernel/net/ipv4/tcp_input.c b/kernel/net/ipv4/tcp_input.c
index c9ab96418..d4c511584 100644
--- a/kernel/net/ipv4/tcp_input.c
+++ b/kernel/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
+int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
int sysctl_tcp_thin_dupack __read_mostly;
@@ -109,6 +110,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
+#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -196,11 +198,13 @@ static void tcp_enter_quickack_mode(struct sock *sk)
* and the session is not interactive.
*/
-static inline bool tcp_in_quickack_mode(const struct sock *sk)
+static bool tcp_in_quickack_mode(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
- return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+ return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
+ (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
}
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
@@ -359,7 +363,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
/* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp &&
(int)tp->rcv_ssthresh < tcp_space(sk) &&
- !sk_under_memory_pressure(sk)) {
+ !tcp_under_memory_pressure(sk)) {
int incr;
/* Check #2. Increase window, if skb with such overhead
@@ -446,7 +450,7 @@ static void tcp_clamp_window(struct sock *sk)
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !sk_under_memory_pressure(sk) &&
+ !tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]);
@@ -750,13 +754,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
* TCP pacing, to smooth the burst on large writes when packets
* in flight is significantly lower than cwnd (or rwin)
*/
+int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
+int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
+
static void tcp_update_pacing_rate(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
u64 rate;
/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
- rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+ rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+ /* current rate is (cwnd * mss) / srtt
+ * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
+ * In Congestion Avoidance phase, set it to 120 % the current rate.
+ *
+ * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+ * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+ * end of slow start and should slow down.
+ */
+ if (tp->snd_cwnd < tp->snd_ssthresh / 2)
+ rate *= sysctl_tcp_pacing_ss_ratio;
+ else
+ rate *= sysctl_tcp_pacing_ca_ratio;
rate *= max(tp->snd_cwnd, tp->packets_out);
@@ -861,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
if (metric > 0)
tcp_disable_early_retrans(tp);
+ tp->rack.reord = 1;
}
/* This must be called before lost_out is incremented */
@@ -886,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
}
}
-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
- struct sk_buff *skb)
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
@@ -1028,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
return !before(start_seq, end_seq - tp->max_window);
}
-/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "B". Later note: FACK people cheated me again 8), we have to account
- * for reordering! Ugly, but should help.
- *
- * Search retransmitted skbs from write_queue that were sent when snd_nxt was
- * less than what is now known to be received by the other end (derived from
- * highest SACK block). Also calculate the lowest snd_nxt among the remaining
- * retransmitted skbs to avoid some costly processing per ACKs.
- */
-static void tcp_mark_lost_retrans(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int cnt = 0;
- u32 new_low_seq = tp->snd_nxt;
- u32 received_upto = tcp_highest_sack_seq(tp);
-
- if (!tcp_is_fack(tp) || !tp->retrans_out ||
- !after(received_upto, tp->lost_retrans_low) ||
- icsk->icsk_ca_state != TCP_CA_Recovery)
- return;
-
- tcp_for_write_queue(skb, sk) {
- u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
-
- if (skb == tcp_send_head(sk))
- break;
- if (cnt == tp->retrans_out)
- break;
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
- continue;
-
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
- continue;
-
- /* TODO: We would like to get rid of tcp_is_fack(tp) only
- * constraint here (see above) but figuring out that at
- * least tp->reordering SACK blocks reside between ack_seq
- * and received_upto is not easy task to do cheaply with
- * the available datastructures.
- *
- * Whether FACK should check here for tp->reordering segs
- * in-between one could argue for either way (it would be
- * rather simple to implement as we could count fack_count
- * during the walk and do tp->fackets_out - fack_count).
- */
- if (after(received_upto, ack_seq)) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
-
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
- } else {
- if (before(ack_seq, new_low_seq))
- new_low_seq = ack_seq;
- cnt += tcp_skb_pcount(skb);
- }
- }
-
- if (tp->retrans_out)
- tp->lost_retrans_low = new_low_seq;
-}
-
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks,
u32 prior_snd_una)
@@ -1130,7 +1086,12 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sacktag_state {
int reord;
int fack_count;
- long rtt_us; /* RTT measured by SACKing never-retransmitted data */
+ /* Timestamps for earliest and latest never-retransmitted segment
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
+ * but congestion control should still get an accurate delay signal.
+ */
+ struct skb_mstamp first_sackt;
+ struct skb_mstamp last_sackt;
int flag;
};
@@ -1212,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
+ tcp_rack_advance(tp, xmit_time, sacked);
+
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
* we do not clear RETRANS, believing
@@ -1233,14 +1196,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
state->reord);
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
- /* Pick the earliest sequence sacked for RTT */
- if (state->rtt_us < 0) {
- struct skb_mstamp now;
-
- skb_mstamp_get(&now);
- state->rtt_us = skb_mstamp_us_delta(&now,
- xmit_time);
- }
+ if (state->first_sackt.v64 == 0)
+ state->first_sackt = *xmit_time;
+ state->last_sackt = *xmit_time;
}
if (sacked & TCPCB_LOST) {
@@ -1316,16 +1274,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
* code can come after this skb later on it's better to keep
* setting gso_size to something.
*/
- if (!skb_shinfo(prev)->gso_size) {
- skb_shinfo(prev)->gso_size = mss;
- skb_shinfo(prev)->gso_type = sk->sk_gso_type;
- }
+ if (!TCP_SKB_CB(prev)->tcp_gso_size)
+ TCP_SKB_CB(prev)->tcp_gso_size = mss;
/* CHECKME: To clear or not to clear? Mimics normal skb currently */
- if (tcp_skb_pcount(skb) <= 1) {
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
- }
+ if (tcp_skb_pcount(skb) <= 1)
+ TCP_SKB_CB(skb)->tcp_gso_size = 0;
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1634,7 +1588,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
- u32 prior_snd_una, long *sack_rtt_us)
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
{
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1642,7 +1596,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
struct tcp_sack_block sp[TCP_NUM_SACKS];
struct tcp_sack_block *cache;
- struct tcp_sacktag_state state;
struct sk_buff *skb;
int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
int used_sacks;
@@ -1650,9 +1603,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
int i, j;
int first_sack_index;
- state.flag = 0;
- state.reord = tp->packets_out;
- state.rtt_us = -1L;
+ state->flag = 0;
+ state->reord = tp->packets_out;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
@@ -1663,7 +1615,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
if (found_dup_sack)
- state.flag |= FLAG_DSACKING_ACK;
+ state->flag |= FLAG_DSACKING_ACK;
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1728,7 +1680,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
skb = tcp_write_queue_head(sk);
- state.fack_count = 0;
+ state->fack_count = 0;
i = 0;
if (!tp->sacked_out) {
@@ -1762,10 +1714,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
/* Head todo? */
if (before(start_seq, cache->start_seq)) {
- skb = tcp_sacktag_skip(skb, sk, &state,
+ skb = tcp_sacktag_skip(skb, sk, state,
start_seq);
skb = tcp_sacktag_walk(skb, sk, next_dup,
- &state,
+ state,
start_seq,
cache->start_seq,
dup_sack);
@@ -1776,7 +1728,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
goto advance_sp;
skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
- &state,
+ state,
cache->end_seq);
/* ...tail remains todo... */
@@ -1785,12 +1737,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
skb = tcp_highest_sack(sk);
if (!skb)
break;
- state.fack_count = tp->fackets_out;
+ state->fack_count = tp->fackets_out;
cache++;
goto walk;
}
- skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
+ skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
/* Check overlap against next cached too (past this one already) */
cache++;
continue;
@@ -1800,12 +1752,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
skb = tcp_highest_sack(sk);
if (!skb)
break;
- state.fack_count = tp->fackets_out;
+ state->fack_count = tp->fackets_out;
}
- skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
+ skb = tcp_sacktag_skip(skb, sk, state, start_seq);
walk:
- skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
+ skb = tcp_sacktag_walk(skb, sk, next_dup, state,
start_seq, end_seq, dup_sack);
advance_sp:
@@ -1820,11 +1772,10 @@ advance_sp:
for (j = 0; j < used_sacks; j++)
tp->recv_sack_cache[i++] = sp[j];
- if ((state.reord < tp->fackets_out) &&
+ if ((state->reord < tp->fackets_out) &&
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
- tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
+ tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
- tcp_mark_lost_retrans(sk);
tcp_verify_left_out(tp);
out:
@@ -1834,8 +1785,7 @@ out:
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
- *sack_rtt_us = state.rtt_us;
- return state.flag;
+ return state->flag;
}
/* Limits sacked_out so that sum with lost_out isn't ever larger than
@@ -1924,14 +1874,13 @@ void tcp_enter_loss(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- bool new_recovery = false;
+ bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
- new_recovery = true;
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2255,7 +2204,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
(oldcnt >= packets))
break;
- mss = skb_shinfo(skb)->gso_size;
+ mss = tcp_skb_mss(skb);
err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
mss, GFP_ATOMIC);
if (err < 0)
@@ -2303,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
+static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
+{
+ return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ before(tp->rx_opt.rcv_tsecr, when);
+}
+
+/* skb is spurious retransmitted if the returned timestamp echo
+ * reply is prior to the skb transmission time
+ */
+static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
+ const struct sk_buff *skb)
+{
+ return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
+ tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+}
+
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
- (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
+ tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
}
/* Undo procedures. */
@@ -2482,15 +2446,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
return false;
}
-/* The cwnd reduction in CWR and Recovery use the PRR algorithm
- * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
* It computes the number of packets to send (sndcnt) based on packets newly
* delivered:
* 1) If the packets in flight is larger than ssthresh, PRR spreads the
* cwnd reductions across a full RTT.
- * 2) If packets in flight is lower than ssthresh (such as due to excess
- * losses and/or application stalls), do not perform any further cwnd
- * reductions, but instead slow start up to ssthresh.
+ * 2) Otherwise PRR uses packet conservation to send as much as delivered.
+ * But when the retransmits are acked without further losses, PRR
+ * slow starts cwnd up to ssthresh to speed up the recovery.
*/
static void tcp_init_cwnd_reduction(struct sock *sk)
{
@@ -2507,7 +2470,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
}
static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
- int fast_rexmit)
+ int fast_rexmit, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
@@ -2515,17 +2478,22 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
int newly_acked_sacked = prior_unsacked -
(tp->packets_out - tp->sacked_out);
+ if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
+ return;
+
tp->prr_delivered += newly_acked_sacked;
- if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+ if (delta < 0) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else {
+ } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
+ !(flag & FLAG_LOST_RETRANS)) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
+ } else {
+ sndcnt = min(delta, newly_acked_sacked);
}
-
sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
@@ -2555,6 +2523,7 @@ void tcp_enter_cwr(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_CWR);
}
}
+EXPORT_SYMBOL(tcp_enter_cwr);
static void tcp_try_keep_open(struct sock *sk)
{
@@ -2585,7 +2554,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
} else {
- tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
}
}
@@ -2595,6 +2564,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}
static void tcp_mtup_probe_success(struct sock *sk)
@@ -2614,6 +2584,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
/* Do a simple retransmit without using the backoff mechanisms in
@@ -2682,7 +2653,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tp->prior_ssthresh = 0;
tcp_init_undo(tp);
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (!tcp_in_cwnd_reduction(sk)) {
if (!ece_ack)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tcp_init_cwnd_reduction(sk);
@@ -2742,7 +2713,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
/* Undo during fast recovery after partial ACK. */
static bool tcp_try_undo_partial(struct sock *sk, const int acked,
- const int prior_unsacked)
+ const int prior_unsacked, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2758,7 +2729,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
* mark more packets lost or retransmit more.
*/
if (tp->retrans_out) {
- tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
return true;
}
@@ -2838,6 +2809,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
}
+ /* Use RACK to detect loss */
+ if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
+ tcp_rack_mark_lost(sk))
+ flag |= FLAG_LOST_RETRANS;
+
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
@@ -2845,7 +2821,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
- if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+ if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
return;
/* Partial ACK arrived. Force fast retransmit. */
do_lost = tcp_is_reno(tp) ||
@@ -2858,9 +2834,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack);
- if (icsk->icsk_ca_state != TCP_CA_Open)
+ if (icsk->icsk_ca_state != TCP_CA_Open &&
+ !(flag & FLAG_LOST_RETRANS))
return;
- /* Fall through to processing in Open state. */
+ /* Change state if cwnd is undone or retransmits are lost */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -2895,12 +2872,73 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
- tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
+ tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
tcp_xmit_retransmit_queue(sk);
}
+/* Kathleen Nichols' algorithm for tracking the minimum value of
+ * a data stream over some fixed time interval. (E.g., the minimum
+ * RTT over the past five minutes.) It uses constant space and constant
+ * time per update yet almost always delivers the same minimum as an
+ * implementation that has to keep all the data in the window.
+ *
+ * The algorithm keeps track of the best, 2nd best & 3rd best min
+ * values, maintaining an invariant that the measurement time of the
+ * n'th best >= n-1'th best. It also makes sure that the three values
+ * are widely separated in the time window since that bounds the worse
+ * case error when that data is monotonically increasing over the window.
+ *
+ * Upon getting a new min, we can forget everything earlier because it
+ * has no value - the new min is <= everything else in the window by
+ * definition and it's the most recent. So we restart fresh on every new min
+ * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
+ * best.
+ */
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+{
+ const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
+ struct rtt_meas *m = tcp_sk(sk)->rtt_min;
+ struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
+ u32 elapsed;
+
+ /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
+ if (unlikely(rttm.rtt <= m[0].rtt))
+ m[0] = m[1] = m[2] = rttm;
+ else if (rttm.rtt <= m[1].rtt)
+ m[1] = m[2] = rttm;
+ else if (rttm.rtt <= m[2].rtt)
+ m[2] = rttm;
+
+ elapsed = now - m[0].ts;
+ if (unlikely(elapsed > wlen)) {
+ /* Passed entire window without a new min so make 2nd choice
+ * the new min & 3rd choice the new 2nd. So forth and so on.
+ */
+ m[0] = m[1];
+ m[1] = m[2];
+ m[2] = rttm;
+ if (now - m[0].ts > wlen) {
+ m[0] = m[1];
+ m[1] = rttm;
+ if (now - m[0].ts > wlen)
+ m[0] = rttm;
+ }
+ } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
+ /* Passed a quarter of the window without a new min so
+ * take 2nd choice from the 2nd quarter of the window.
+ */
+ m[2] = m[1] = rttm;
+ } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
+ /* Passed half the window without a new min so take the 3rd
+ * choice from the last half of the window.
+ */
+ m[2] = rttm;
+ }
+}
+
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
- long seq_rtt_us, long sack_rtt_us)
+ long seq_rtt_us, long sack_rtt_us,
+ long ca_rtt_us)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2909,9 +2947,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* Karn's algorithm forbids taking RTT if some retransmitted data
* is acked (RFC6298).
*/
- if (flag & FLAG_RETRANS_DATA_ACKED)
- seq_rtt_us = -1L;
-
if (seq_rtt_us < 0)
seq_rtt_us = sack_rtt_us;
@@ -2923,11 +2958,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
*/
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
flag & FLAG_ACKED)
- seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-
+ seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
+ tp->rx_opt.rcv_tsecr);
if (seq_rtt_us < 0)
return false;
+ /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
+ * always taken together with ACK, SACK, or TS-opts. Any negative
+ * values will be skipped with the seq_rtt_us < 0 check above.
+ */
+ tcp_update_rtt_min(sk, ca_rtt_us);
tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
@@ -2937,21 +2977,21 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
}
/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
+void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
- struct tcp_sock *tp = tcp_sk(sk);
- long seq_rtt_us = -1L;
+ long rtt_us = -1L;
- if (synack_stamp && !tp->total_retrans)
- seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
+ if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) {
+ struct skb_mstamp now;
- /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
- * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
- */
- if (!tp->srtt_us)
- tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
+ skb_mstamp_get(&now);
+ rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
+ }
+
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
}
+
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3055,7 +3095,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una, long sack_rtt_us)
+ u32 prior_snd_una,
+ struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt, now;
@@ -3063,8 +3104,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
bool fully_acked = true;
- long ca_seq_rtt_us = -1L;
+ long sack_rtt_us = -1L;
long seq_rtt_us = -1L;
+ long ca_rtt_us = -1L;
struct sk_buff *skb;
u32 pkts_acked = 0;
bool rtt_update;
@@ -3113,6 +3155,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= acked_pcount;
+ else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
+ tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3151,17 +3195,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_SACK_RENEGING;
skb_mstamp_get(&now);
- if (likely(first_ackt.v64)) {
+ if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
- ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ }
+ if (sack->first_sackt.v64) {
+ sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
+ ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
}
- rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
+ ca_rtt_us);
if (flag & FLAG_ACKED) {
- const struct tcp_congestion_ops *ca_ops
- = inet_csk(sk)->icsk_ca_ops;
-
tcp_rearm_rto(sk);
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
@@ -3184,11 +3230,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
- if (ca_ops->pkts_acked) {
- long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us);
- ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
- }
-
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
@@ -3198,6 +3239,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_rearm_rto(sk);
}
+ if (icsk->icsk_ca_ops->pkts_acked)
+ icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
+
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
@@ -3238,7 +3282,7 @@ static void tcp_ack_probe(struct sock *sk)
* This function is not for random using!
*/
} else {
- unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+ unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
when, TCP_RTO_MAX);
@@ -3331,6 +3375,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
tp->pred_flags = 0;
tcp_fast_path_check(sk);
+ if (tcp_send_head(sk))
+ tcp_slow_start_after_idle_check(sk);
+
if (nwin > tp->max_window) {
tp->max_window = nwin;
tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
@@ -3466,6 +3513,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sacktag_state sack_state;
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -3474,7 +3522,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
int acked = 0; /* Number of packets newly acked */
- long sack_rtt_us = -1L;
+
+ sack_state.first_sackt.v64 = 0;
/* We very likely will need to access write queue head. */
prefetchw(sk->sk_write_queue.next);
@@ -3538,7 +3587,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt_us);
+ &sack_state);
if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
flag |= FLAG_ECE;
@@ -3563,13 +3612,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
acked = tp->packets_out;
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
- sack_rtt_us);
+ &sack_state);
acked -= tp->packets_out;
- /* Advance cwnd if state allows */
- if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, acked);
-
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, acked, prior_unsacked,
@@ -3578,6 +3623,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
+ /* Advance cwnd if state allows */
+ if (tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, acked);
+
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
@@ -3615,7 +3664,7 @@ old_ack:
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt_us);
+ &sack_state);
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
}
@@ -3951,7 +4000,6 @@ void tcp_reset(struct sock *sk)
static void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- const struct dst_entry *dst;
inet_csk_schedule_ack(sk);
@@ -3963,9 +4011,7 @@ static void tcp_fin(struct sock *sk)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- dst = __sk_dst_get(sk);
- if (!dst || !dst_metric(dst, RTAX_QUICKACK))
- inet_csk(sk)->icsk_ack.pingpong = 1;
+ inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
@@ -4438,19 +4484,34 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_buff *skb;
+ int err = -ENOMEM;
+ int data_len = 0;
bool fragstolen;
if (size == 0)
return 0;
- skb = alloc_skb(size, sk->sk_allocation);
+ if (size > PAGE_SIZE) {
+ int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
+
+ data_len = npages << PAGE_SHIFT;
+ size = data_len + (size & ~PAGE_MASK);
+ }
+ skb = alloc_skb_with_frags(size - data_len, data_len,
+ PAGE_ALLOC_COSTLY_ORDER,
+ &err, sk->sk_allocation);
if (!skb)
goto err;
+ skb_put(skb, size - data_len);
+ skb->data_len = data_len;
+ skb->len = size;
+
if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto err_free;
- if (memcpy_from_msg(skb_put(skb, size), msg, size))
+ err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
+ if (err)
goto err_free;
TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
@@ -4466,7 +4527,8 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
err_free:
kfree_skb(skb);
err:
- return -ENOMEM;
+ return err;
+
}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
@@ -4514,10 +4576,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) {
queue_and_out:
- if (eaten < 0 &&
- tcp_try_rmem_schedule(sk, skb, skb->truesize))
- goto drop;
-
+ if (eaten < 0) {
+ if (skb_queue_len(&sk->sk_receive_queue) == 0)
+ sk_forced_mem_schedule(sk, skb->truesize);
+ else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+ }
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
@@ -4788,7 +4852,7 @@ static int tcp_prune_queue(struct sock *sk)
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
- else if (sk_under_memory_pressure(sk))
+ else if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
@@ -4832,7 +4896,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
return false;
/* If we are under global TCP memory pressure, do not expand. */
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
return false;
/* If we are under soft global TCP memory pressure, do not expand. */
@@ -5451,7 +5515,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
}
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+ const struct tcphdr *th)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -5622,6 +5686,7 @@ discard:
}
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->copied_seq = tp->rcv_nxt;
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -5677,15 +5742,14 @@ reset_and_undo:
* address independent.
*/
-int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
int queued = 0;
bool acceptable;
- u32 synack_stamp;
tp->rx_opt.saw_tstamp = 0;
@@ -5729,7 +5793,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
case TCP_SYN_SENT:
- queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+ queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
@@ -5764,15 +5828,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (!acceptable)
return 1;
+ if (!tp->srtt_us)
+ tcp_synack_rtt_meas(sk, req);
+
/* Once we leave TCP_SYN_RECV, we no longer need req
* so release it.
*/
if (req) {
- synack_stamp = tcp_rsk(req)->snt_synack;
tp->total_retrans = req->num_retrans;
reqsk_fastopen_remove(sk, req, false);
} else {
- synack_stamp = tp->lsndtime;
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_congestion_control(sk);
@@ -5795,7 +5860,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- tcp_synack_rtt_meas(sk, synack_stamp);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -5982,14 +6046,17 @@ static void tcp_ecn_create_request(struct request_sock *req,
const struct net *net = sock_net(listen_sk);
bool th_ecn = th->ece && th->cwr;
bool ect, ecn_ok;
+ u32 ecn_ok_dst;
if (!th_ecn)
return;
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
- ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
+ ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
+ ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
- if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
+ if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+ (ecn_ok_dst & DST_FEATURE_ECN_CA))
inet_rsk(req)->ecn_ok = 1;
}
@@ -5999,11 +6066,11 @@ static void tcp_openreq_init(struct request_sock *req,
{
struct inet_request_sock *ireq = inet_rsk(req);
- req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ skb_mstamp_get(&tcp_rsk(req)->snt_synack);
tcp_rsk(req)->last_oow_ack_time = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
@@ -6019,9 +6086,11 @@ static void tcp_openreq_init(struct request_sock *req,
}
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
- struct sock *sk_listener)
+ struct sock *sk_listener,
+ bool attach_listener)
{
- struct request_sock *req = reqsk_alloc(ops, sk_listener);
+ struct request_sock *req = reqsk_alloc(ops, sk_listener,
+ attach_listener);
if (req) {
struct inet_request_sock *ireq = inet_rsk(req);
@@ -6041,13 +6110,13 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(struct sock *sk,
+static bool tcp_syn_flood_action(const struct sock *sk,
const struct sk_buff *skb,
const char *proto)
{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
bool want_cookie = false;
- struct listen_sock *lopt;
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
@@ -6058,29 +6127,45 @@ static bool tcp_syn_flood_action(struct sock *sk,
#endif
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
- if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
- lopt->synflood_warned = 1;
+ if (!queue->synflood_warned &&
+ sysctl_tcp_syncookies != 2 &&
+ xchg(&queue->synflood_warned, 1) == 0)
pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, ntohs(tcp_hdr(skb)->dest), msg);
- }
+
return want_cookie;
}
+static void tcp_reqsk_record_syn(const struct sock *sk,
+ struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ if (tcp_sk(sk)->save_syn) {
+ u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
+ u32 *copy;
+
+ copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
+ if (copy) {
+ copy[0] = len;
+ memcpy(&copy[1], skb_network_header(skb), len);
+ req->saved_syn = copy;
+ }
+ }
+}
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
{
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
- struct request_sock *req;
struct tcp_sock *tp = tcp_sk(sk);
+ struct sock *fastopen_sk = NULL;
struct dst_entry *dst = NULL;
- __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
- bool want_cookie = false, fastopen;
+ struct request_sock *req;
+ bool want_cookie = false;
struct flowi fl;
- struct tcp_fastopen_cookie foc = { .len = -1 };
- int err;
-
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
@@ -6104,7 +6189,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
- req = inet_reqsk_alloc(rsk_ops, sk);
+ req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
if (!req)
goto drop;
@@ -6187,19 +6272,30 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
tcp_rsk(req)->snt_isn = isn;
+ tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
- fastopen = !want_cookie &&
- tcp_try_fastopen(sk, skb, req, &foc, dst);
- err = af_ops->send_synack(sk, dst, &fl, req,
- skb_get_queue_mapping(skb), &foc);
- if (!fastopen) {
- if (err || want_cookie)
- goto drop_and_free;
-
+ if (!want_cookie) {
+ tcp_reqsk_record_syn(sk, req, skb);
+ fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
+ }
+ if (fastopen_sk) {
+ af_ops->send_synack(fastopen_sk, dst, &fl, req,
+ &foc, false);
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(fastopen_sk);
+ sock_put(fastopen_sk);
+ } else {
tcp_rsk(req)->tfo_listener = false;
- af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ if (!want_cookie)
+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ af_ops->send_synack(sk, dst, &fl, req,
+ &foc, !want_cookie);
+ if (want_cookie)
+ goto drop_and_free;
}
-
+ reqsk_put(req);
return 0;
drop_and_release:
diff --git a/kernel/net/ipv4/tcp_ipv4.c b/kernel/net/ipv4/tcp_ipv4.c
index 441ca6f38..8c7e63163 100644
--- a/kernel/net/ipv4/tcp_ipv4.c
+++ b/kernel/net/ipv4/tcp_ipv4.c
@@ -222,7 +222,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (err)
goto failure;
- inet_set_txhash(sk);
+ sk_set_txhash(sk);
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
inet->inet_sport, inet->inet_dport, sk);
@@ -312,7 +312,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
-void tcp_req_err(struct sock *sk, u32 seq)
+void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
struct request_sock *req = inet_reqsk(sk);
struct net *net = sock_net(sk);
@@ -324,17 +324,17 @@ void tcp_req_err(struct sock *sk, u32 seq)
if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
- reqsk_put(req);
- } else {
+ } else if (abort) {
/*
* Still in SYN_RECV, just remove it silently.
* There is no good way to pass the error to the newly
* created socket, and POSIX does not want network
* errors returned from accept().
*/
- NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
}
+ reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
@@ -384,7 +384,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
}
seq = ntohl(th->seq);
if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq);
+ return tcp_req_err(sk, seq,
+ type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -576,7 +581,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
* Exception: precedence violation. We do not implement it in any case.
*/
-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
@@ -705,7 +710,8 @@ release_sk1:
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+static void tcp_v4_send_ack(struct net *net,
+ struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
@@ -720,7 +726,6 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
];
} rep;
struct ip_reply_arg arg;
- struct net *net = dev_net(skb_dst(skb)->dev);
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -782,7 +787,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcp_v4_send_ack(sock_net(sk), skb,
+ tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
@@ -795,15 +801,17 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
inet_twsk_put(tw);
}
-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
- tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
- tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
+ tcp_sk(sk)->snd_nxt;
+
+ tcp_v4_send_ack(sock_net(sk), skb, seq,
+ tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
tcp_time_stamp,
req->ts_recent,
0,
@@ -818,11 +826,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
* This still operates on a request_sock only, not on a big
* socket.
*/
-static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
- u16 queue_mapping,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc,
+ bool attach_req)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -833,12 +841,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc);
+ skb = tcp_make_synack(sk, dst, req, foc, attach_req);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
- skb_set_queue_mapping(skb, queue_mapping);
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
ireq->opt);
@@ -865,7 +872,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
*/
/* Find the Key structure for an address. */
-struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
+struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
const union tcp_md5_addr *addr,
int family)
{
@@ -877,7 +884,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
/* caller either holds rcu_read_lock() or socket lock */
md5sig = rcu_dereference_check(tp->md5sig_info,
sock_owned_by_user(sk) ||
- lockdep_is_held(&sk->sk_lock.slock));
+ lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
if (!md5sig)
return NULL;
#if IS_ENABLED(CONFIG_IPV6)
@@ -894,7 +901,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
-struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
+struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
const union tcp_md5_addr *addr;
@@ -922,7 +929,8 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
}
md5sig = rcu_dereference_protected(tp->md5sig_info,
- sock_owned_by_user(sk));
+ sock_owned_by_user(sk) ||
+ lockdep_is_held(&sk->sk_lock.slock));
if (!md5sig) {
md5sig = kmalloc(sizeof(*md5sig), gfp);
if (!md5sig)
@@ -1112,10 +1120,13 @@ clear_hash_noput:
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
+#endif
+
/* Called with rcu_read_lock() */
-static bool tcp_v4_inbound_md5_hash(struct sock *sk,
+static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
const struct sk_buff *skb)
{
+#ifdef CONFIG_TCP_MD5SIG
/*
* This gets called for each TCP segment that arrives
* so we want to be efficient.
@@ -1165,10 +1176,12 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk,
return true;
}
return false;
-}
#endif
+ return false;
+}
-static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
+static void tcp_v4_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
@@ -1179,7 +1192,8 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
ireq->opt = tcp_v4_save_options(skb);
}
-static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
+static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
+ struct flowi *fl,
const struct request_sock *req,
bool *strict)
{
@@ -1218,7 +1232,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.route_req = tcp_v4_route_req,
.init_seq = tcp_v4_init_sequence,
.send_synack = tcp_v4_send_synack,
- .queue_hash_add = inet_csk_reqsk_queue_hash_add,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -1241,9 +1254,11 @@ EXPORT_SYMBOL(tcp_v4_conn_request);
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
*/
-struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct dst_entry *dst)
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
{
struct inet_request_sock *ireq;
struct inet_sock *newinet;
@@ -1277,7 +1292,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->mc_ttl = ip_hdr(skb)->ttl;
newinet->rcv_tos = ip_hdr(skb)->tos;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- inet_set_txhash(newsk);
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = newtp->write_seq ^ jiffies;
@@ -1320,7 +1334,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
- __inet_hash_nolisten(newsk, NULL);
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ if (*own_req)
+ tcp_move_syn(newtp, req);
return newsk;
@@ -1338,34 +1354,11 @@ put_and_exit:
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
+#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
- const struct iphdr *iph = ip_hdr(skb);
- struct request_sock *req;
- struct sock *nsk;
- req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
- if (req) {
- nsk = tcp_check_req(sk, skb, req, false);
- if (!nsk || nsk == sk)
- reqsk_put(req);
- return nsk;
- }
-
- nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
- th->source, iph->daddr, th->dest, inet_iif(skb));
-
- if (nsk) {
- if (nsk->sk_state != TCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
-
-#ifdef CONFIG_SYN_COOKIES
if (!th->syn)
sk = cookie_v4_check(sk, skb);
#endif
@@ -1373,7 +1366,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
}
/* The socket must have it's spinlock held when we get
- * here.
+ * here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
@@ -1400,17 +1393,17 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
}
- if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+ if (tcp_checksum_complete(skb))
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
- struct sock *nsk = tcp_v4_hnd_req(sk, skb);
+ struct sock *nsk = tcp_v4_cookie_check(sk, skb);
+
if (!nsk)
goto discard;
-
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
- sk_mark_napi_id(sk, skb);
+ sk_mark_napi_id(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
@@ -1420,7 +1413,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
} else
sock_rps_save_rxhash(sk, skb);
- if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
+ if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
@@ -1508,7 +1501,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
if (likely(sk->sk_rx_dst))
skb_dst_drop(skb);
else
- skb_dst_force(skb);
+ skb_dst_force_safe(skb);
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
@@ -1590,6 +1583,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
+lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
if (!sk)
goto no_tcp_socket;
@@ -1598,6 +1592,35 @@ process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ struct sock *nsk;
+
+ sk = req->rsk_listener;
+ if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
+ reqsk_put(req);
+ goto discard_it;
+ }
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ sock_hold(sk);
+ nsk = tcp_check_req(sk, skb, req, false);
+ if (!nsk) {
+ reqsk_put(req);
+ goto discard_and_relse;
+ }
+ if (nsk == sk) {
+ reqsk_put(req);
+ } else if (tcp_child_process(sk, nsk, skb)) {
+ tcp_v4_send_reset(nsk, skb);
+ goto discard_and_relse;
+ } else {
+ sock_put(sk);
+ return 0;
+ }
+ }
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
@@ -1606,26 +1629,25 @@ process:
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * We really want to reject the packet as early as possible
- * if:
- * o We're expecting an MD5'd packet and this is no MD5 tcp option
- * o There is an MD5 option and we're not expecting one
- */
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard_and_relse;
-#endif
nf_reset(skb);
if (sk_filter(sk, skb))
goto discard_and_relse;
- sk_incoming_cpu_update(sk);
skb->dev = NULL;
+ if (sk->sk_state == TCP_LISTEN) {
+ ret = tcp_v4_do_rcv(sk, skb);
+ goto put_and_return;
+ }
+
+ sk_incoming_cpu_update(sk);
+
bh_lock_sock_nested(sk);
+ tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
@@ -1638,6 +1660,7 @@ process:
}
bh_unlock_sock(sk);
+put_and_return:
sock_put(sk);
return ret;
@@ -1646,7 +1669,7 @@ no_tcp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
- if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+ if (tcp_checksum_complete(skb)) {
csum_error:
TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
@@ -1670,10 +1693,6 @@ do_time_wait:
goto discard_it;
}
- if (skb->len < (th->doff << 2)) {
- inet_twsk_put(inet_twsk(sk));
- goto bad_packet;
- }
if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
goto csum_error;
@@ -1686,8 +1705,7 @@ do_time_wait:
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
goto process;
}
@@ -1713,8 +1731,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- if (dst) {
- dst_hold(dst);
+ if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
@@ -1802,6 +1819,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
/* If socket is aborted during connect operation */
tcp_free_fastopen_req(tp);
+ tcp_saved_syn_free(tp);
sk_sockets_allocated_dec(sk);
sock_release_memcg(sk);
@@ -1836,35 +1854,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
++st->num;
++st->offset;
- if (st->state == TCP_SEQ_STATE_OPENREQ) {
- struct request_sock *req = cur;
-
- icsk = inet_csk(st->syn_wait_sk);
- req = req->dl_next;
- while (1) {
- while (req) {
- if (req->rsk_ops->family == st->family) {
- cur = req;
- goto out;
- }
- req = req->dl_next;
- }
- if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
- break;
-get_req:
- req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
- }
- sk = sk_nulls_next(st->syn_wait_sk);
- st->state = TCP_SEQ_STATE_LISTENING;
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- } else {
- icsk = inet_csk(sk);
- spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue))
- goto start_req;
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- sk = sk_nulls_next(sk);
- }
+ sk = sk_nulls_next(sk);
get_sk:
sk_nulls_for_each_from(sk, node) {
if (!net_eq(sock_net(sk), net))
@@ -1874,16 +1864,6 @@ get_sk:
goto out;
}
icsk = inet_csk(sk);
- spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
-start_req:
- st->uid = sock_i_uid(sk);
- st->syn_wait_sk = sk;
- st->state = TCP_SEQ_STATE_OPENREQ;
- st->sbucket = 0;
- goto get_req;
- }
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
spin_unlock_bh(&ilb->lock);
st->offset = 0;
@@ -2015,7 +1995,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
void *rc = NULL;
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
if (st->bucket >= INET_LHTABLE_SIZE)
break;
@@ -2074,7 +2053,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
@@ -2099,11 +2077,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
struct tcp_iter_state *st = seq->private;
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
- if (v) {
- struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- }
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
@@ -2157,7 +2130,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
- struct seq_file *f, int i, kuid_t uid)
+ struct seq_file *f, int i)
{
const struct inet_request_sock *ireq = inet_rsk(req);
long delta = req->rsk_timer.expires - jiffies;
@@ -2174,7 +2147,8 @@ static void get_openreq4(const struct request_sock *req,
1, /* timers active (only the expire timer) */
jiffies_delta_to_clock_t(delta),
req->num_timeout,
- from_kuid_munged(seq_user_ns(f), uid),
+ from_kuid_munged(seq_user_ns(f),
+ sock_i_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0,
@@ -2188,12 +2162,13 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct inet_sock *inet = inet_sk(sk);
- struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
+ const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
__be32 dest = inet->inet_daddr;
__be32 src = inet->inet_rcv_saddr;
__u16 destp = ntohs(inet->inet_dport);
__u16 srcp = ntohs(inet->inet_sport);
int rx_queue;
+ int state;
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
@@ -2211,17 +2186,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
timer_expires = jiffies;
}
- if (sk->sk_state == TCP_LISTEN)
+ state = sk_state_load(sk);
+ if (state == TCP_LISTEN)
rx_queue = sk->sk_ack_backlog;
else
- /*
- * because we dont lock socket, we might find a transient negative value
+ /* Because we don't lock the socket,
+ * we might find a transient negative value.
*/
rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
- i, src, srcp, dest, destp, sk->sk_state,
+ i, src, srcp, dest, destp, state,
tp->write_seq - tp->snd_una,
rx_queue,
timer_active,
@@ -2235,8 +2211,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
tp->snd_cwnd,
- sk->sk_state == TCP_LISTEN ?
- (fastopenq ? fastopenq->max_qlen : 0) :
+ state == TCP_LISTEN ?
+ fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
@@ -2275,18 +2251,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
}
st = seq->private;
- switch (st->state) {
- case TCP_SEQ_STATE_LISTENING:
- case TCP_SEQ_STATE_ESTABLISHED:
- if (sk->sk_state == TCP_TIME_WAIT)
- get_timewait4_sock(v, seq, st->num);
- else
- get_tcp4_sock(v, seq, st->num);
- break;
- case TCP_SEQ_STATE_OPENREQ:
- get_openreq4(v, seq, st->num, st->uid);
- break;
- }
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait4_sock(v, seq, st->num);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ get_openreq4(v, seq, st->num);
+ else
+ get_tcp4_sock(v, seq, st->num);
out:
seq_pad(seq, '\n');
return 0;
@@ -2410,12 +2380,15 @@ static int __net_init tcp_sk_init(struct net *net)
goto fail;
*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
}
+
net->ipv4.sysctl_tcp_ecn = 2;
+ net->ipv4.sysctl_tcp_ecn_fallback = 1;
+
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
- return 0;
+ return 0;
fail:
tcp_sk_exit(net);
diff --git a/kernel/net/ipv4/tcp_metrics.c b/kernel/net/ipv4/tcp_metrics.c
index a51d63a43..c8cbc2b4b 100644
--- a/kernel/net/ipv4/tcp_metrics.c
+++ b/kernel/net/ipv4/tcp_metrics.c
@@ -81,11 +81,7 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
static bool addr_same(const struct inetpeer_addr *a,
const struct inetpeer_addr *b)
{
- if (a->family != b->family)
- return false;
- if (a->family == AF_INET)
- return a->addr.a4 == b->addr.a4;
- return ipv6_addr_equal(&a->addr.in6, &b->addr.in6);
+ return inetpeer_addr_cmp(a, b) == 0;
}
struct tcpm_hash_bucket {
@@ -247,14 +243,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
daddr.family = req->rsk_ops->family;
switch (daddr.family) {
case AF_INET:
- saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
- daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
- hash = (__force unsigned int) daddr.addr.a4;
+ inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr);
+ inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr);
+ hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr);
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
- daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr;
+ inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr);
+ inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr);
hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
break;
#endif
@@ -285,25 +281,19 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
struct net *net;
if (tw->tw_family == AF_INET) {
- saddr.family = AF_INET;
- saddr.addr.a4 = tw->tw_rcv_saddr;
- daddr.family = AF_INET;
- daddr.addr.a4 = tw->tw_daddr;
- hash = (__force unsigned int) daddr.addr.a4;
+ inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr);
+ inetpeer_set_addr_v4(&daddr, tw->tw_daddr);
+ hash = ipv4_addr_hash(tw->tw_daddr);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (tw->tw_family == AF_INET6) {
if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
- saddr.family = AF_INET;
- saddr.addr.a4 = tw->tw_rcv_saddr;
- daddr.family = AF_INET;
- daddr.addr.a4 = tw->tw_daddr;
- hash = (__force unsigned int) daddr.addr.a4;
+ inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr);
+ inetpeer_set_addr_v4(&daddr, tw->tw_daddr);
+ hash = ipv4_addr_hash(tw->tw_daddr);
} else {
- saddr.family = AF_INET6;
- saddr.addr.in6 = tw->tw_v6_rcv_saddr;
- daddr.family = AF_INET6;
- daddr.addr.in6 = tw->tw_v6_daddr;
+ inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr);
+ inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr);
hash = ipv6_addr_hash(&tw->tw_v6_daddr);
}
}
@@ -335,25 +325,19 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
struct net *net;
if (sk->sk_family == AF_INET) {
- saddr.family = AF_INET;
- saddr.addr.a4 = inet_sk(sk)->inet_saddr;
- daddr.family = AF_INET;
- daddr.addr.a4 = inet_sk(sk)->inet_daddr;
- hash = (__force unsigned int) daddr.addr.a4;
+ inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
+ inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
+ hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
- saddr.family = AF_INET;
- saddr.addr.a4 = inet_sk(sk)->inet_saddr;
- daddr.family = AF_INET;
- daddr.addr.a4 = inet_sk(sk)->inet_daddr;
- hash = (__force unsigned int) daddr.addr.a4;
+ inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
+ inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
+ hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
} else {
- saddr.family = AF_INET6;
- saddr.addr.in6 = sk->sk_v6_rcv_saddr;
- daddr.family = AF_INET6;
- daddr.addr.in6 = sk->sk_v6_daddr;
+ inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr);
+ inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr);
hash = ipv6_addr_hash(&sk->sk_v6_daddr);
}
}
@@ -461,7 +445,7 @@ void tcp_update_metrics(struct sock *sk)
tcp_metric_set(tm, TCP_METRIC_CWND,
tp->snd_cwnd);
}
- } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ } else if (!tcp_in_slow_start(tp) &&
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
@@ -796,18 +780,18 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
switch (tm->tcpm_daddr.family) {
case AF_INET:
if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
- tm->tcpm_daddr.addr.a4) < 0)
+ inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0)
goto nla_put_failure;
if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
- tm->tcpm_saddr.addr.a4) < 0)
+ inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0)
goto nla_put_failure;
break;
case AF_INET6:
if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
- &tm->tcpm_daddr.addr.in6) < 0)
+ inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0)
goto nla_put_failure;
if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
- &tm->tcpm_saddr.addr.in6) < 0)
+ inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0)
goto nla_put_failure;
break;
default:
@@ -956,20 +940,21 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
a = info->attrs[v4];
if (a) {
- addr->family = AF_INET;
- addr->addr.a4 = nla_get_in_addr(a);
+ inetpeer_set_addr_v4(addr, nla_get_in_addr(a));
if (hash)
- *hash = (__force unsigned int) addr->addr.a4;
+ *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr));
return 0;
}
a = info->attrs[v6];
if (a) {
+ struct in6_addr in6;
+
if (nla_len(a) != sizeof(struct in6_addr))
return -EINVAL;
- addr->family = AF_INET6;
- addr->addr.in6 = nla_get_in6_addr(a);
+ in6 = nla_get_in6_addr(a);
+ inetpeer_set_addr_v6(addr, &in6);
if (hash)
- *hash = ipv6_addr_hash(&addr->addr.in6);
+ *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr));
return 0;
}
return optional ? 1 : -EAFNOSUPPORT;
diff --git a/kernel/net/ipv4/tcp_minisocks.c b/kernel/net/ipv4/tcp_minisocks.c
index 17e7339ee..ac6b1961f 100644
--- a/kernel/net/ipv4/tcp_minisocks.c
+++ b/kernel/net/ipv4/tcp_minisocks.c
@@ -147,8 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (!th->fin ||
TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
return TCP_TW_RST;
}
@@ -163,9 +162,9 @@ kill_with_rst:
if (tcp_death_row.sysctl_tw_recycle &&
tcptw->tw_ts_recent_stamp &&
tcp_tw_remember_stamp(tw))
- inet_twsk_schedule(tw, tw->tw_timeout);
+ inet_twsk_reschedule(tw, tw->tw_timeout);
else
- inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
return TCP_TW_ACK;
}
@@ -198,12 +197,11 @@ kill_with_rst:
*/
if (sysctl_tcp_rfc1337 == 0) {
kill:
- inet_twsk_deschedule(tw);
- inet_twsk_put(tw);
+ inet_twsk_deschedule_put(tw);
return TCP_TW_SUCCESS;
}
}
- inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
if (tmp_opt.saw_tstamp) {
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
@@ -253,7 +251,7 @@ kill:
* Do not reschedule in the last case.
*/
if (paws_reject || th->ack)
- inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
@@ -324,9 +322,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
} while (0);
#endif
- /* Linkage updates. */
- __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
-
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
timeo = rto;
@@ -340,6 +335,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
}
inet_twsk_schedule(tw, timeo);
+ /* Linkage updates. */
+ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
inet_twsk_put(tw);
} else {
/* Sorry, if we're out of memory, just CLOSE this
@@ -364,30 +361,38 @@ void tcp_twsk_destructor(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+/* Warning : This function is called without sk_listener being locked.
+ * Be sure to read socket fields once, as their value could change under us.
+ */
void tcp_openreq_init_rwin(struct request_sock *req,
- struct sock *sk, struct dst_entry *dst)
+ const struct sock *sk_listener,
+ const struct dst_entry *dst)
{
struct inet_request_sock *ireq = inet_rsk(req);
- struct tcp_sock *tp = tcp_sk(sk);
- __u8 rcv_wscale;
+ const struct tcp_sock *tp = tcp_sk(sk_listener);
+ u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ int full_space = tcp_full_space(sk_listener);
int mss = dst_metric_advmss(dst);
+ u32 window_clamp;
+ __u8 rcv_wscale;
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
- mss = tp->rx_opt.user_mss;
+ if (user_mss && user_mss < mss)
+ mss = user_mss;
+ window_clamp = READ_ONCE(tp->window_clamp);
/* Set this up on the first call only */
- req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
/* limit the window selection if the user enforce a smaller rx buffer */
- if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
- (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
- req->window_clamp = tcp_full_space(sk);
+ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
/* tcp_full_space because it is guaranteed to be the first packet */
- tcp_select_initial_window(tcp_full_space(sk),
+ tcp_select_initial_window(full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
- &req->rcv_wnd,
- &req->window_clamp,
+ &req->rsk_rcv_wnd,
+ &req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
@@ -436,7 +441,9 @@ EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
* Actually, we could lots of memory writes here. tp of listening
* socket contains all necessary default parameters.
*/
-struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
+struct sock *tcp_create_openreq_child(const struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *skb)
{
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
@@ -451,6 +458,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rcv_wup = newtp->copied_seq =
newtp->rcv_nxt = treq->rcv_isn + 1;
+ newtp->segs_in = 0;
newtp->snd_sml = newtp->snd_una =
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
@@ -462,6 +470,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ newtp->rtt_min[0].rtt = ~0U;
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -471,7 +480,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tcp_enable_early_retrans(newtp);
newtp->tlp_high_seq = 0;
- newtp->lsndtime = treq->snt_synack;
+ newtp->lsndtime = treq->snt_synack.stamp_jiffies;
+ newsk->sk_txhash = treq->txhash;
newtp->last_oow_ack_time = 0;
newtp->total_retrans = req->num_retrans;
@@ -503,9 +513,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
if (sysctl_tcp_fack)
tcp_enable_fack(newtp);
}
- newtp->window_clamp = req->window_clamp;
- newtp->rcv_ssthresh = req->rcv_wnd;
- newtp->rcv_wnd = req->rcv_wnd;
+ newtp->window_clamp = req->rsk_window_clamp;
+ newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_wnd = req->rsk_rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
@@ -538,6 +548,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_ecn_openreq_child(newtp, req);
newtp->fastopen_rsk = NULL;
newtp->syn_data_acked = 0;
+ newtp->rack.mstamp.v64 = 0;
+ newtp->rack.advanced = 0;
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
@@ -565,8 +577,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
bool paws_reject = false;
-
- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
+ bool own_req;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
@@ -697,7 +708,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* RFC793: "first check sequence number". */
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
+ tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_ack(sk, skb, req);
@@ -754,16 +765,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* ESTABLISHED STATE. If it will be dropped after
* socket is created, wait for troubles.
*/
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ req, &own_req);
if (!child)
goto listen_overflow;
- inet_csk_reqsk_queue_drop(sk, req);
- inet_csk_reqsk_queue_add(sk, req, child);
- /* Warning: caller must not call reqsk_put(req);
- * child stole last reference on it.
- */
- return child;
+ sock_rps_save_rxhash(child, skb);
+ tcp_synack_rtt_meas(child, req);
+ return inet_csk_complete_hashdance(sk, child, req, own_req);
listen_overflow:
if (!sysctl_tcp_abort_on_overflow) {
@@ -810,8 +819,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
int state = child->sk_state;
if (!sock_owned_by_user(child)) {
- ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
- skb->len);
+ ret = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
parent->sk_data_ready(parent);
diff --git a/kernel/net/ipv4/tcp_offload.c b/kernel/net/ipv4/tcp_offload.c
index 3f7c2fca5..9864a2dba 100644
--- a/kernel/net/ipv4/tcp_offload.c
+++ b/kernel/net/ipv4/tcp_offload.c
@@ -77,7 +77,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
oldlen = (u16)~skb->len;
__skb_pull(skb, thlen);
- mss = tcp_skb_mss(skb);
+ mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
@@ -242,7 +242,7 @@ found:
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
- mss = tcp_skb_mss(p);
+ mss = skb_shinfo(p)->gso_size;
flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
diff --git a/kernel/net/ipv4/tcp_output.c b/kernel/net/ipv4/tcp_output.c
index 986440b24..9bfc39ff2 100644
--- a/kernel/net/ipv4/tcp_output.c
+++ b/kernel/net/ipv4/tcp_output.c
@@ -50,8 +50,8 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
*/
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
-/* Default TSQ limit of two TSO segments */
-int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+/* Default TSQ limit of four TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 262144;
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
@@ -137,12 +137,12 @@ static __u16 tcp_advertise_mss(struct sock *sk)
}
/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
- * This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
+ * This is the first part of cwnd validation mechanism.
+ */
+void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
struct tcp_sock *tp = tcp_sk(sk);
- s32 delta = tcp_time_stamp - tp->lsndtime;
- u32 restart_cwnd = tcp_init_cwnd(tp, dst);
+ u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
u32 cwnd = tp->snd_cwnd;
tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
@@ -163,20 +163,17 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
{
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
- const struct dst_entry *dst = __sk_dst_get(sk);
- if (sysctl_tcp_slow_start_after_idle &&
- (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
- tcp_cwnd_restart(sk, __sk_dst_get(sk));
+ if (tcp_packets_in_flight(tp) == 0)
+ tcp_ca_event(sk, CA_EVENT_TX_START);
tp->lsndtime = now;
/* If it is a reply for ato after last received
* packet, enter pingpong mode.
*/
- if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
- (!dst || !dst_metric(dst, RTAX_QUICKACK)))
- icsk->icsk_ack.pingpong = 1;
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ icsk->icsk_ack.pingpong = 1;
}
/* Account for an ACK we sent. */
@@ -350,15 +347,20 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
}
}
+static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
+{
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
+ /* tp->ecn_flags are cleared at a later point in time when
+ * SYN ACK is ultimatively being received.
+ */
+ TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
+}
+
static void
-tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
- struct sock *sk)
+tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
- if (inet_rsk(req)->ecn_ok) {
+ if (inet_rsk(req)->ecn_ok)
th->ece = 1;
- if (tcp_ca_needs_ecn(sk))
- INET_ECN_xmit(sk);
- }
}
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
@@ -393,8 +395,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
*/
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
- struct skb_shared_info *shinfo = skb_shinfo(skb);
-
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
@@ -402,8 +402,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
TCP_SKB_CB(skb)->sacked = 0;
tcp_skb_pcount_set(skb, 1);
- shinfo->gso_size = 0;
- shinfo->gso_type = 0;
TCP_SKB_CB(skb)->seq = seq;
if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -610,12 +608,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
/* Set up TCP options for SYN-ACKs. */
-static unsigned int tcp_synack_options(struct sock *sk,
- struct request_sock *req,
- unsigned int mss, struct sk_buff *skb,
- struct tcp_out_options *opts,
- const struct tcp_md5sig_key *md5,
- struct tcp_fastopen_cookie *foc)
+static unsigned int tcp_synack_options(struct request_sock *req,
+ unsigned int mss, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ const struct tcp_md5sig_key *md5,
+ struct tcp_fastopen_cookie *foc)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -941,9 +938,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
&md5);
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
- if (tcp_packets_in_flight(tp) == 0)
- tcp_ca_event(sk, CA_EVENT_TX_START);
-
/* if no packet is in qdisc/device queue, then allow XPS to select
* another queue. We can be called from tcp_tsq_handler()
* which holds one reference to sk_wmem_alloc.
@@ -994,6 +988,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
}
tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
tcp_ecn_send(sk, skb, tcp_header_size);
@@ -1018,8 +1013,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
- /* OK, its time to fill skb_shinfo(skb)->gso_segs */
+ tp->segs_out += tcp_skb_pcount(skb);
+ /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+ skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
/* Our usage of tstamp should remain private */
skb->tstamp.tv64 = 0;
@@ -1056,25 +1053,17 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
}
/* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
- unsigned int mss_now)
+static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
- struct skb_shared_info *shinfo = skb_shinfo(skb);
-
- /* Make sure we own this skb before messing gso_size/gso_segs */
- WARN_ON_ONCE(skb_cloned(skb));
-
if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
tcp_skb_pcount_set(skb, 1);
- shinfo->gso_size = 0;
- shinfo->gso_type = 0;
+ TCP_SKB_CB(skb)->tcp_gso_size = 0;
} else {
tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
- shinfo->gso_size = mss_now;
- shinfo->gso_type = sk->sk_gso_type;
+ TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
}
}
@@ -1163,7 +1152,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
return -ENOMEM;
/* Get a new skb... force flag on. */
- buff = sk_stream_alloc_skb(sk, nsize, gfp);
+ buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
if (!buff)
return -ENOMEM; /* We'll just try again later. */
@@ -1206,8 +1195,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
old_factor = tcp_skb_pcount(skb);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb, mss_now);
- tcp_set_skb_tso_segs(sk, buff, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
+ tcp_set_skb_tso_segs(buff, mss_now);
/* If this packet has been sent out already, we must
* adjust the various packet counters.
@@ -1287,7 +1276,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
/* Any change of skb->len requires recalculation of tso factor. */
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
+ tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
return 0;
}
@@ -1619,13 +1608,12 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
- unsigned int mss_now)
+static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
- tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
tso_segs = tcp_skb_pcount(skb);
}
return tso_segs;
@@ -1680,7 +1668,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
const struct tcp_sock *tp = tcp_sk(sk);
unsigned int cwnd_quota;
- tcp_init_tso_segs(sk, skb, cur_mss);
+ tcp_init_tso_segs(skb, cur_mss);
if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
return 0;
@@ -1722,7 +1710,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (skb->len != skb->data_len)
return tcp_fragment(sk, skb, len, mss_now, gfp);
- buff = sk_stream_alloc_skb(sk, 0, gfp);
+ buff = sk_stream_alloc_skb(sk, 0, gfp, true);
if (unlikely(!buff))
return -ENOMEM;
@@ -1749,8 +1737,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
tcp_fragment_tstamp(skb, buff);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb, mss_now);
- tcp_set_skb_tso_segs(sk, buff, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
+ tcp_set_skb_tso_segs(buff, mss_now);
/* Link BUFF into the send queue. */
__skb_header_release(buff);
@@ -1777,7 +1765,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto send_now;
- if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
+ if (icsk->icsk_ca_state >= TCP_CA_Recovery)
goto send_now;
/* Avoid bursty behavior by allowing defer
@@ -1834,7 +1822,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
/* Ok, it looks like it is advisable to defer. */
- if (cong_win < send_win && cong_win < skb->len)
+ if (cong_win < send_win && cong_win <= skb->len)
*is_cwnd_limited = true;
return true;
@@ -1941,7 +1929,7 @@ static int tcp_mtu_probe(struct sock *sk)
}
/* We're allowed to probe. Build it now. */
- nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
+ nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
if (!nskb)
return -1;
sk->sk_wmem_queued += nskb->truesize;
@@ -1984,7 +1972,7 @@ static int tcp_mtu_probe(struct sock *sk)
skb->len, 0);
} else {
__pskb_trim_head(skb, copy);
- tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(skb, mss_now);
}
TCP_SKB_CB(skb)->seq += copy;
}
@@ -1994,7 +1982,7 @@ static int tcp_mtu_probe(struct sock *sk)
if (len >= probe_size)
break;
}
- tcp_init_tso_segs(sk, nskb, nskb->len);
+ tcp_init_tso_segs(nskb, nskb->len);
/* We're ready to send. If this fails, the probe will
* be resegmented into mss-sized pieces by tcp_write_xmit().
@@ -2056,7 +2044,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
- tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+ tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
@@ -2067,7 +2055,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
- is_cwnd_limited = true;
if (push_one == 2)
/* Force out a loss probe pkt. */
cwnd_quota = 1;
@@ -2078,7 +2065,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
break;
- if (tso_segs == 1 || !max_segs) {
+ if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH))))
@@ -2091,7 +2078,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
limit = mss_now;
- if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp))
+ if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int,
cwnd_quota,
@@ -2149,10 +2136,11 @@ repair:
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk);
+ is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
- return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
+ return !tp->packets_out && tcp_send_head(sk);
}
bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2172,7 +2160,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Don't do any loss probe on a Fast Open connection before 3WHS
* finishes.
*/
- if (sk->sk_state == TCP_SYN_RECV)
+ if (tp->fastopen_rsk)
return false;
/* TLP is only scheduled when next timer event is RTO. */
@@ -2182,7 +2170,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
- if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
+ if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
!tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
return false;
@@ -2191,9 +2179,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
return false;
/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
- * for delayed ack when there's one outstanding packet.
+ * for delayed ack when there's one outstanding packet. If no RTT
+ * sample is available then probe after TCP_TIMEOUT_INIT.
*/
- timeout = rtt << 1;
+ timeout = rtt << 1 ? : TCP_TIMEOUT_INIT;
if (tp->packets_out == 1)
timeout = max_t(u32, timeout,
(rtt + (rtt >> 1) + TCP_DELACK_MAX));
@@ -2229,7 +2218,7 @@ static bool skb_still_in_host_queue(const struct sock *sk,
return false;
}
-/* When probe timeout (PTO) fires, send a new segment if one exists, else
+/* When probe timeout (PTO) fires, try send a new segment if possible, else
* retransmit the last segment.
*/
void tcp_send_loss_probe(struct sock *sk)
@@ -2238,11 +2227,19 @@ void tcp_send_loss_probe(struct sock *sk)
struct sk_buff *skb;
int pcount;
int mss = tcp_current_mss(sk);
- int err = -1;
- if (tcp_send_head(sk)) {
- err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
- goto rearm_timer;
+ skb = tcp_send_head(sk);
+ if (skb) {
+ if (tcp_snd_wnd_test(tp, skb, mss)) {
+ pcount = tp->packets_out;
+ tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ if (tp->packets_out > pcount)
+ goto probe_sent;
+ goto rearm_timer;
+ }
+ skb = tcp_write_queue_prev(sk, skb);
+ } else {
+ skb = tcp_write_queue_tail(sk);
}
/* At most one outstanding TLP retransmission. */
@@ -2250,7 +2247,6 @@ void tcp_send_loss_probe(struct sock *sk)
goto rearm_timer;
/* Retransmit last segment. */
- skb = tcp_write_queue_tail(sk);
if (WARN_ON(!skb))
goto rearm_timer;
@@ -2265,26 +2261,24 @@ void tcp_send_loss_probe(struct sock *sk)
if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
GFP_ATOMIC)))
goto rearm_timer;
- skb = tcp_write_queue_tail(sk);
+ skb = tcp_write_queue_next(sk, skb);
}
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
- err = __tcp_retransmit_skb(sk, skb);
+ if (__tcp_retransmit_skb(sk, skb))
+ goto rearm_timer;
/* Record snd_nxt for loss detection. */
- if (likely(!err))
- tp->tlp_high_seq = tp->snd_nxt;
+ tp->tlp_high_seq = tp->snd_nxt;
+probe_sent:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
+ /* Reset s.t. tcp_rearm_rto will restart timer from now */
+ inet_csk(sk)->icsk_pending = 0;
rearm_timer:
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
-
- if (likely(!err))
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPLOSSPROBES);
+ tcp_rearm_rto(sk);
}
/* Push out any pending frames which were held back due to
@@ -2392,7 +2386,7 @@ u32 __tcp_select_window(struct sock *sk)
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss);
@@ -2610,11 +2604,15 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (unlikely(oldpcount > 1)) {
if (skb_unclone(skb, GFP_ATOMIC))
return -ENOMEM;
- tcp_init_tso_segs(sk, skb, cur_mss);
+ tcp_init_tso_segs(skb, cur_mss);
tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
}
}
+ /* RFC3168, section 6.1.1.1. ECN fallback */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
+ tcp_ecn_clear_syn(sk, skb);
+
tcp_retrans_try_collapse(sk, skb, cur_mss);
/* Make a copy, if the first transmission SKB clone we made
@@ -2657,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
net_dbg_ratelimited("retrans_out leaked\n");
}
#endif
- if (!tp->retrans_out)
- tp->lost_retrans_low = tp->snd_nxt;
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
@@ -2666,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
- /* snd_nxt is stored to detect loss of retransmitted segment,
- * see tcp_input.c tcp_sacktag_write_queue().
- */
- TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
} else if (err != -EBUSY) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
@@ -2816,8 +2808,10 @@ begin_fwd:
* connection tear down and (memory) recovery.
* Otherwise tcp_send_fin() could be tempted to either delay FIN
* or even be forced to close flow without any FIN.
+ * In general, we want to allow one skb per socket to avoid hangs
+ * with edge trigger epoll()
*/
-static void sk_forced_wmem_schedule(struct sock *sk, int size)
+void sk_forced_mem_schedule(struct sock *sk, int size)
{
int amt, status;
@@ -2841,7 +2835,7 @@ void tcp_send_fin(struct sock *sk)
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
- if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+ if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
coalesce:
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
@@ -2864,7 +2858,7 @@ coalesce:
return;
}
skb_reserve(skb, MAX_TCP_HEADER);
- sk_forced_wmem_schedule(sk, skb->truesize);
+ sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
@@ -2945,20 +2939,22 @@ int tcp_send_synack(struct sock *sk)
* Allocate one skb and build a SYNACK packet.
* @dst is consumed : Caller should not use it again.
*/
-struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc,
+ bool attach_req)
{
- struct tcp_out_options opts;
struct inet_request_sock *ireq = inet_rsk(req);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcphdr *th;
- struct sk_buff *skb;
+ const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *md5 = NULL;
+ struct tcp_out_options opts;
+ struct sk_buff *skb;
int tcp_header_size;
+ struct tcphdr *th;
+ u16 user_mss;
int mss;
- skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
@@ -2966,11 +2962,21 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
+ if (attach_req) {
+ skb_set_owner_w(skb, req_to_sk(req));
+ } else {
+ /* sk is a const pointer, because we want to express multiple
+ * cpu might call us concurrently.
+ * sk->sk_wmem_alloc in an atomic, we can promote to rw.
+ */
+ skb_set_owner_w(skb, (struct sock *)sk);
+ }
skb_dst_set(skb, dst);
mss = dst_metric_advmss(dst);
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
- mss = tp->rx_opt.user_mss;
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss && user_mss < mss)
+ mss = user_mss;
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
@@ -2984,8 +2990,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
rcu_read_lock();
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
- tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
- foc) + sizeof(*th);
+ skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
+ tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +
+ sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -2994,7 +3001,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- tcp_ecn_make_synack(req, th, sk);
+ tcp_ecn_make_synack(req, th);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
@@ -3008,8 +3015,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
- th->window = htons(min(req->rcv_wnd, 65535U));
- tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ th->window = htons(min(req->rsk_rcv_wnd, 65535U));
+ tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
@@ -3143,7 +3150,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0, copied;
+ int syn_loss = 0, space, err = 0;
unsigned long last_syn_loss = 0;
struct sk_buff *syn_data;
@@ -3176,22 +3183,23 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
/* limit to order-0 allocations */
space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
- syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+ syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
if (!syn_data)
goto fallback;
syn_data->ip_summed = CHECKSUM_PARTIAL;
memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
- copied = copy_from_iter(skb_put(syn_data, space), space,
- &fo->data->msg_iter);
- if (unlikely(!copied)) {
- kfree_skb(syn_data);
- goto fallback;
- }
- if (copied != space) {
- skb_trim(syn_data, copied);
- space = copied;
+ if (space) {
+ int copied = copy_from_iter(skb_put(syn_data, space), space,
+ &fo->data->msg_iter);
+ if (unlikely(!copied)) {
+ kfree_skb(syn_data);
+ goto fallback;
+ }
+ if (copied != space) {
+ skb_trim(syn_data, copied);
+ space = copied;
+ }
}
-
/* No more data pending in inet_wait_for_connect() */
if (space == fo->size)
fo->data = NULL;
@@ -3242,7 +3250,7 @@ int tcp_connect(struct sock *sk)
return 0;
}
- buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
if (unlikely(!buff))
return -ENOBUFS;
@@ -3383,7 +3391,7 @@ EXPORT_SYMBOL_GPL(tcp_send_ack);
* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
* out-of-date with SND.UNA-1 to probe window.
*/
-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -3401,6 +3409,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
*/
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
skb_mstamp_get(&skb->skb_mstamp);
+ NET_INC_STATS(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
@@ -3408,12 +3417,12 @@ void tcp_send_window_probe(struct sock *sk)
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
- tcp_xmit_probe_skb(sk, 0);
+ tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
}
}
/* Initiate keepalive or window probe from timer. */
-int tcp_write_wakeup(struct sock *sk)
+int tcp_write_wakeup(struct sock *sk, int mib)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -3441,7 +3450,7 @@ int tcp_write_wakeup(struct sock *sk)
if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
return -1;
} else if (!tcp_skb_pcount(skb))
- tcp_set_skb_tso_segs(sk, skb, mss);
+ tcp_set_skb_tso_segs(skb, mss);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
@@ -3450,8 +3459,8 @@ int tcp_write_wakeup(struct sock *sk)
return err;
} else {
if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
- tcp_xmit_probe_skb(sk, 1);
- return tcp_xmit_probe_skb(sk, 0);
+ tcp_xmit_probe_skb(sk, 1, mib);
+ return tcp_xmit_probe_skb(sk, 0, mib);
}
}
@@ -3465,7 +3474,7 @@ void tcp_send_probe0(struct sock *sk)
unsigned long probe_max;
int err;
- err = tcp_write_wakeup(sk);
+ err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
if (tp->packets_out || !tcp_send_head(sk)) {
/* Cancel probe timer, if it is not required. */
@@ -3491,17 +3500,18 @@ void tcp_send_probe0(struct sock *sk)
probe_max = TCP_RESOURCE_PROBE_INTERVAL;
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- inet_csk_rto_backoff(icsk, probe_max),
+ tcp_probe0_when(sk, probe_max),
TCP_RTO_MAX);
}
-int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
{
const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
struct flowi fl;
int res;
- res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+ tcp_rsk(req)->txhash = net_tx_rndhash();
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true);
if (!res) {
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
diff --git a/kernel/net/ipv4/tcp_recovery.c b/kernel/net/ipv4/tcp_recovery.c
new file mode 100644
index 000000000..5353085fd
--- /dev/null
+++ b/kernel/net/ipv4/tcp_recovery.c
@@ -0,0 +1,109 @@
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+
+/* Marks a packet lost, if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ * dupthresh: 3 OOO packets delivered (packet count)
+ * FACK: sequence delta to highest sacked sequence (sequence space)
+ * RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is it applies to both original and retransmitted
+ * packet and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * The current version is only used after recovery starts but can be
+ * easily extended to detect the first loss.
+ */
+int tcp_rack_mark_lost(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ u32 reo_wnd, prior_retrans = tp->retrans_out;
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+ return 0;
+
+ /* Reset the advanced flag to avoid unnecessary queue scanning */
+ tp->rack.advanced = 0;
+
+ /* To be more reordering resilient, allow min_rtt/4 settling delay
+ * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
+ * RTT because reordering is often a path property and less related
+ * to queuing or delayed ACKs.
+ *
+ * TODO: measure and adapt to the observed reordering delay, and
+ * use a timer to retransmit like the delayed early retransmit.
+ */
+ reo_wnd = 1000;
+ if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+ reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+
+ tcp_for_write_queue(skb, sk) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+ if (skb == tcp_send_head(sk))
+ break;
+
+ /* Skip ones already (s)acked */
+ if (!after(scb->end_seq, tp->snd_una) ||
+ scb->sacked & TCPCB_SACKED_ACKED)
+ continue;
+
+ if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+
+ if (skb_mstamp_us_delta(&tp->rack.mstamp,
+ &skb->skb_mstamp) <= reo_wnd)
+ continue;
+
+ /* skb is lost if packet sent later is sacked */
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ if (scb->sacked & TCPCB_SACKED_RETRANS) {
+ scb->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSTRETRANSMIT);
+ }
+ } else if (!(scb->sacked & TCPCB_RETRANS)) {
+ /* Original data are sent sequentially so stop early
+ * b/c the rest are all sent after rack_sent
+ */
+ break;
+ }
+ }
+ return prior_retrans - tp->retrans_out;
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets */
+void tcp_rack_advance(struct tcp_sock *tp,
+ const struct skb_mstamp *xmit_time, u8 sacked)
+{
+ if (tp->rack.mstamp.v64 &&
+ !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+ return;
+
+ if (sacked & TCPCB_RETRANS) {
+ struct skb_mstamp now;
+
+ /* If the sacked packet was retransmitted, it's ambiguous
+ * whether the retransmission or the original (or the prior
+ * retransmission) was sacked.
+ *
+ * If the original is lost, there is no ambiguity. Otherwise
+ * we assume the original can be delayed up to aRTT + min_rtt.
+ * the aRTT term is bounded by the fast recovery or timeout,
+ * so it's at least one RTT (i.e., retransmission is at least
+ * an RTT later).
+ */
+ skb_mstamp_get(&now);
+ if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+ return;
+ }
+
+ tp->rack.mstamp = *xmit_time;
+ tp->rack.advanced = 1;
+}
diff --git a/kernel/net/ipv4/tcp_scalable.c b/kernel/net/ipv4/tcp_scalable.c
index 333bcb241..bf5ea9e9b 100644
--- a/kernel/net/ipv4/tcp_scalable.c
+++ b/kernel/net/ipv4/tcp_scalable.c
@@ -22,7 +22,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
diff --git a/kernel/net/ipv4/tcp_timer.c b/kernel/net/ipv4/tcp_timer.c
index 8c65dc147..193ba1fa8 100644
--- a/kernel/net/ipv4/tcp_timer.c
+++ b/kernel/net/ipv4/tcp_timer.c
@@ -83,7 +83,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
}
/* Calculate maximal number or retries on an orphaned socket. */
-static int tcp_orphan_retries(struct sock *sk, int alive)
+static int tcp_orphan_retries(struct sock *sk, bool alive)
{
int retries = sysctl_tcp_orphan_retries; /* May be zero. */
@@ -168,7 +168,7 @@ static int tcp_write_timeout(struct sock *sk)
dst_negative_advice(sk);
if (tp->syn_fastopen || tp->syn_data)
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
- if (tp->syn_data)
+ if (tp->syn_data && icsk->icsk_retransmits == 1)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
@@ -176,6 +176,18 @@ static int tcp_write_timeout(struct sock *sk)
syn_set = true;
} else {
if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+ /* Some middle-boxes may black-hole Fast Open _after_
+ * the handshake. Therefore we conservatively disable
+ * Fast Open on this path on recurring timeouts with
+ * few or zero bytes acked after Fast Open.
+ */
+ if (tp->syn_data_acked &&
+ tp->bytes_acked <= tp->rx_opt.mss_clamp) {
+ tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
+ if (icsk->icsk_retransmits == sysctl_tcp_retries1)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -184,7 +196,7 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = icsk->icsk_rto < TCP_RTO_MAX;
+ const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
@@ -247,7 +259,7 @@ void tcp_delack_timer_handler(struct sock *sk)
}
out:
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
sk_mem_reclaim(sk);
}
@@ -298,7 +310,7 @@ static void tcp_probe_timer(struct sock *sk)
max_probes = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+ const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
max_probes = tcp_orphan_retries(sk, alive);
if (!alive && icsk->icsk_backoff >= max_probes)
@@ -616,7 +628,7 @@ static void tcp_keepalive_timer (unsigned long data)
tcp_write_err(sk);
goto out;
}
- if (tcp_write_wakeup(sk) <= 0) {
+ if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
icsk->icsk_probes_out++;
elapsed = keepalive_intvl_when(tp);
} else {
@@ -649,4 +661,3 @@ void tcp_init_xmit_timers(struct sock *sk)
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
}
-EXPORT_SYMBOL(tcp_init_xmit_timers);
diff --git a/kernel/net/ipv4/tcp_vegas.c b/kernel/net/ipv4/tcp_vegas.c
index a6cea1d5e..13951c408 100644
--- a/kernel/net/ipv4/tcp_vegas.c
+++ b/kernel/net/ipv4/tcp_vegas.c
@@ -225,7 +225,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
*/
diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
- if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (diff > gamma && tcp_in_slow_start(tp)) {
/* Going too fast. Time to slow down
* and switch to congestion avoidance.
*/
@@ -240,7 +240,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
- } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ } else if (tcp_in_slow_start(tp)) {
/* Slow start. */
tcp_slow_start(tp, acked);
} else {
@@ -281,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
vegas->minRTT = 0x7fffffff;
}
/* Use normal slow start */
- else if (tp->snd_cwnd <= tp->snd_ssthresh)
+ else if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
}
diff --git a/kernel/net/ipv4/tcp_veno.c b/kernel/net/ipv4/tcp_veno.c
index 112151eee..0d094b995 100644
--- a/kernel/net/ipv4/tcp_veno.c
+++ b/kernel/net/ipv4/tcp_veno.c
@@ -150,7 +150,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tcp_in_slow_start(tp)) {
/* Slow start. */
tcp_slow_start(tp, acked);
} else {
diff --git a/kernel/net/ipv4/tcp_yeah.c b/kernel/net/ipv4/tcp_yeah.c
index 17d356629..3e6a472e6 100644
--- a/kernel/net/ipv4/tcp_yeah.c
+++ b/kernel/net/ipv4/tcp_yeah.c
@@ -219,7 +219,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
- return tp->snd_cwnd - reduction;
+ return max_t(int, tp->snd_cwnd - reduction, 2);
}
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
diff --git a/kernel/net/ipv4/udp.c b/kernel/net/ipv4/udp.c
index 1b8c5ba7d..7f8ab46ad 100644
--- a/kernel/net/ipv4/udp.c
+++ b/kernel/net/ipv4/udp.c
@@ -100,7 +100,6 @@
#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
-#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>
@@ -375,7 +374,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
-
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
return score;
}
@@ -419,6 +419,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
score += 4;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
+
return score;
}
@@ -963,8 +966,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = ip_cmsg_send(sock_net(sk), msg, &ipc,
sk->sk_family == AF_INET6);
- if (err)
+ if (unlikely(err)) {
+ kfree(ipc.opt);
return err;
+ }
if (ipc.opt)
free = 1;
connected = 0;
@@ -1013,13 +1018,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!rt) {
struct net *net = sock_net(sk);
+ __u8 flow_flags = inet_sk_flowi_flags(sk);
fl4 = &fl4_stack;
+
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
- inet_sk_flowi_flags(sk),
+ flow_flags,
faddr, saddr, dport, inet->inet_sport);
+ if (!saddr && ipc.oif) {
+ err = l3mdev_get_saddr(net, ipc.oif, fl4);
+ if (err < 0)
+ goto out;
+ }
+
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
diff --git a/kernel/net/ipv4/udp_diag.c b/kernel/net/ipv4/udp_diag.c
index b763c39ae..6116604bf 100644
--- a/kernel/net/ipv4/udp_diag.c
+++ b/kernel/net/ipv4/udp_diag.c
@@ -170,6 +170,7 @@ static const struct inet_diag_handler udp_diag_handler = {
.dump_one = udp_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDP,
+ .idiag_info_size = 0,
};
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
@@ -190,6 +191,7 @@ static const struct inet_diag_handler udplite_diag_handler = {
.dump_one = udplite_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDPLITE,
+ .idiag_info_size = 0,
};
static int __init udp_diag_init(void)
diff --git a/kernel/net/ipv4/udp_tunnel.c b/kernel/net/ipv4/udp_tunnel.c
index 6bb98cc19..aba428626 100644
--- a/kernel/net/ipv4/udp_tunnel.c
+++ b/kernel/net/ipv4/udp_tunnel.c
@@ -4,9 +4,10 @@
#include <linux/udp.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <net/dst_metadata.h>
+#include <net/net_namespace.h>
#include <net/udp.h>
#include <net/udp_tunnel.h>
-#include <net/net_namespace.h>
int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
struct socket **sockp)
@@ -15,12 +16,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
struct socket *sock = NULL;
struct sockaddr_in udp_addr;
- err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
+ err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock);
if (err < 0)
goto error;
- sk_change_net(sock->sk, net);
-
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
@@ -47,7 +46,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
error:
if (sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sock->sk);
+ sock_release(sock);
}
*sockp = NULL;
return err;
@@ -101,8 +100,30 @@ void udp_tunnel_sock_release(struct socket *sock)
{
rcu_assign_sk_user_data(sock->sk, NULL);
kernel_sock_shutdown(sock, SHUT_RDWR);
- sk_release_kernel(sock->sk);
+ sock_release(sock);
}
EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
+struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
+ __be16 flags, __be64 tunnel_id, int md_size)
+{
+ struct metadata_dst *tun_dst;
+ struct ip_tunnel_info *info;
+
+ if (family == AF_INET)
+ tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size);
+ else
+ tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size);
+ if (!tun_dst)
+ return NULL;
+
+ info = &tun_dst->u.tun_info;
+ info->key.tp_src = udp_hdr(skb)->source;
+ info->key.tp_dst = udp_hdr(skb)->dest;
+ if (udp_hdr(skb)->check)
+ info->key.tun_flags |= TUNNEL_CSUM;
+ return tun_dst;
+}
+EXPORT_SYMBOL_GPL(udp_tun_rx_dst);
+
MODULE_LICENSE("GPL");
diff --git a/kernel/net/ipv4/xfrm4_input.c b/kernel/net/ipv4/xfrm4_input.c
index 60b032f58..62e1e72db 100644
--- a/kernel/net/ipv4/xfrm4_input.c
+++ b/kernel/net/ipv4/xfrm4_input.c
@@ -22,7 +22,8 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
return xfrm4_extract_header(skb);
}
-static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb)
+static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
if (!skb_dst(skb)) {
const struct iphdr *iph = ip_hdr(skb);
@@ -52,8 +53,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
- skb->dev, NULL,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
xfrm4_rcv_encap_finish);
return 0;
}
diff --git a/kernel/net/ipv4/xfrm4_output.c b/kernel/net/ipv4/xfrm4_output.c
index 2878dbfff..7ee6518af 100644
--- a/kernel/net/ipv4/xfrm4_output.c
+++ b/kernel/net/ipv4/xfrm4_output.c
@@ -30,6 +30,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
mtu = dst_mtu(skb_dst(skb));
if (skb->len > mtu) {
+ skb->protocol = htons(ETH_P_IP);
+
if (skb->sk)
xfrm_local_error(skb, mtu);
else
@@ -80,24 +82,25 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
return xfrm_output(sk, skb);
}
-static int __xfrm4_output(struct sock *sk, struct sk_buff *skb)
+static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct xfrm_state *x = skb_dst(skb)->xfrm;
#ifdef CONFIG_NETFILTER
if (!x) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
return x->outer_mode->afinfo->output_finish(sk, skb);
}
-int xfrm4_output(struct sock *sk, struct sk_buff *skb)
+int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
- NULL, skb_dst(skb)->dev, __xfrm4_output,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb_dst(skb)->dev,
+ __xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/kernel/net/ipv4/xfrm4_policy.c b/kernel/net/ipv4/xfrm4_policy.c
index bff69746e..7b0edb37a 100644
--- a/kernel/net/ipv4/xfrm4_policy.c
+++ b/kernel/net/ipv4/xfrm4_policy.c
@@ -15,11 +15,12 @@
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/ip.h>
+#include <net/l3mdev.h>
static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
- int tos,
+ int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr)
{
@@ -28,9 +29,12 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
memset(fl4, 0, sizeof(*fl4));
fl4->daddr = daddr->a4;
fl4->flowi4_tos = tos;
+ fl4->flowi4_oif = oif;
if (saddr)
fl4->saddr = saddr->a4;
+ fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF;
+
rt = __ip_route_output_key(net, fl4);
if (!IS_ERR(rt))
return &rt->dst;
@@ -38,22 +42,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
return ERR_CAST(rt);
}
-static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
+static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr)
{
struct flowi4 fl4;
- return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);
+ return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr);
}
-static int xfrm4_get_saddr(struct net *net,
+static int xfrm4_get_saddr(struct net *net, int oif,
xfrm_address_t *saddr, xfrm_address_t *daddr)
{
struct dst_entry *dst;
struct flowi4 fl4;
- dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);
+ dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr);
if (IS_ERR(dst))
return -EHOSTUNREACH;
@@ -93,6 +97,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_gateway = rt->rt_gateway;
xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+ xdst->u.rt.rt_table_id = rt->rt_table_id;
INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
return 0;
@@ -107,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
int oif = 0;
if (skb_dst(skb))
- oif = skb_dst(skb)->dev->ifindex;
+ oif = l3mdev_fib_oif(skb_dst(skb)->dev);
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
@@ -122,7 +127,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
case IPPROTO_DCCP:
if (xprth + 4 < skb->data ||
pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be16 *ports = (__be16 *)xprth;
+ __be16 *ports;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ports = (__be16 *)xprth;
fl4->fl4_sport = ports[!!reverse];
fl4->fl4_dport = ports[!reverse];
@@ -130,8 +138,12 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
break;
case IPPROTO_ICMP:
- if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
- u8 *icmp = xprth;
+ if (xprth + 2 < skb->data ||
+ pskb_may_pull(skb, xprth + 2 - skb->data)) {
+ u8 *icmp;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ icmp = xprth;
fl4->fl4_icmp_type = icmp[0];
fl4->fl4_icmp_code = icmp[1];
@@ -139,33 +151,50 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
break;
case IPPROTO_ESP:
- if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be32 *ehdr = (__be32 *)xprth;
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be32 *ehdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ehdr = (__be32 *)xprth;
fl4->fl4_ipsec_spi = ehdr[0];
}
break;
case IPPROTO_AH:
- if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
- __be32 *ah_hdr = (__be32 *)xprth;
+ if (xprth + 8 < skb->data ||
+ pskb_may_pull(skb, xprth + 8 - skb->data)) {
+ __be32 *ah_hdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ah_hdr = (__be32 *)xprth;
fl4->fl4_ipsec_spi = ah_hdr[1];
}
break;
case IPPROTO_COMP:
- if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be16 *ipcomp_hdr = (__be16 *)xprth;
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be16 *ipcomp_hdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ipcomp_hdr = (__be16 *)xprth;
fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
}
break;
case IPPROTO_GRE:
- if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
- __be16 *greflags = (__be16 *)xprth;
- __be32 *gre_hdr = (__be32 *)xprth;
+ if (xprth + 12 < skb->data ||
+ pskb_may_pull(skb, xprth + 12 - skb->data)) {
+ __be16 *greflags;
+ __be32 *gre_hdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ greflags = (__be16 *)xprth;
+ gre_hdr = (__be32 *)xprth;
if (greflags[0] & GRE_KEY) {
if (greflags[0] & GRE_CSUM)
@@ -230,7 +259,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
xfrm_dst_ifdown(dst, dev);
}
-static struct dst_ops xfrm4_dst_ops = {
+static struct dst_ops xfrm4_dst_ops_template = {
.family = AF_INET,
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
@@ -239,12 +268,12 @@ static struct dst_ops xfrm4_dst_ops = {
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
- .gc_thresh = 32768,
+ .gc_thresh = INT_MAX,
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.family = AF_INET,
- .dst_ops = &xfrm4_dst_ops,
+ .dst_ops = &xfrm4_dst_ops_template,
.dst_lookup = xfrm4_dst_lookup,
.get_saddr = xfrm4_get_saddr,
.decode_session = _decode_session4,
@@ -266,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = {
{ }
};
-static int __net_init xfrm4_net_init(struct net *net)
+static int __net_init xfrm4_net_sysctl_init(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
@@ -294,7 +323,7 @@ err_alloc:
return -ENOMEM;
}
-static void __net_exit xfrm4_net_exit(struct net *net)
+static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
{
struct ctl_table *table;
@@ -306,12 +335,44 @@ static void __net_exit xfrm4_net_exit(struct net *net)
if (!net_eq(net, &init_net))
kfree(table);
}
+#else /* CONFIG_SYSCTL */
+static int inline xfrm4_net_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static void inline xfrm4_net_sysctl_exit(struct net *net)
+{
+}
+#endif
+
+static int __net_init xfrm4_net_init(struct net *net)
+{
+ int ret;
+
+ memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template,
+ sizeof(xfrm4_dst_ops_template));
+ ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops);
+ if (ret)
+ return ret;
+
+ ret = xfrm4_net_sysctl_init(net);
+ if (ret)
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+
+ return ret;
+}
+
+static void __net_exit xfrm4_net_exit(struct net *net)
+{
+ xfrm4_net_sysctl_exit(net);
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+}
static struct pernet_operations __net_initdata xfrm4_net_ops = {
.init = xfrm4_net_init,
.exit = xfrm4_net_exit,
};
-#endif
static void __init xfrm4_policy_init(void)
{
@@ -320,13 +381,9 @@ static void __init xfrm4_policy_init(void)
void __init xfrm4_init(void)
{
- dst_entries_init(&xfrm4_dst_ops);
-
xfrm4_state_init();
xfrm4_policy_init();
xfrm4_protocol_init();
-#ifdef CONFIG_SYSCTL
register_pernet_subsys(&xfrm4_net_ops);
-#endif
}