summary | refs | log | tree | commit | diff | stats
path: root/kernel/net/netfilter
diff options
context:
space:
mode:
author: José Pekkarinen <jose.pekkarinen@nokia.com> 2016-04-11 10:41:07 +0300
committer: José Pekkarinen <jose.pekkarinen@nokia.com> 2016-04-13 08:17:18 +0300
commit: e09b41010ba33a20a87472ee821fa407a5b8da36 (patch)
tree: d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/net/netfilter
parent: f93b97fd65072de626c074dbe099a1fff05ce060 (diff)
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and the rt patch from the rt wiki download page.

During the rebase, the following patch collided:

Force tick interrupt and get rid of softirq magic (I70131fb85).

The collisions were resolved by dropping the conflicting changes, because the logic of that patch was already present in the source.

Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/net/netfilter')
-rw-r--r--kernel/net/netfilter/Kconfig52
-rw-r--r--kernel/net/netfilter/Makefile3
-rw-r--r--kernel/net/netfilter/core.c233
-rw-r--r--kernel/net/netfilter/ipset/ip_set_bitmap_gen.h61
-rw-r--r--kernel/net/netfilter/ipset/ip_set_bitmap_ip.c58
-rw-r--r--kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c119
-rw-r--r--kernel/net/netfilter/ipset/ip_set_bitmap_port.c45
-rw-r--r--kernel/net/netfilter/ipset/ip_set_core.c404
-rw-r--r--kernel/net/netfilter/ipset/ip_set_getport.c19
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_gen.h756
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_ip.c72
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_ipmark.c87
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_ipport.c98
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_ipportip.c91
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c96
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_mac.c30
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_net.c73
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_netiface.c250
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_netnet.c158
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_netport.c86
-rw-r--r--kernel/net/netfilter/ipset/ip_set_hash_netportnet.c188
-rw-r--r--kernel/net/netfilter/ipset/ip_set_list_set.c427
-rw-r--r--kernel/net/netfilter/ipset/pfxlen.c16
-rw-r--r--kernel/net/netfilter/ipvs/Kconfig11
-rw-r--r--kernel/net/netfilter/ipvs/Makefile1
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_app.c36
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_conn.c91
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_core.c566
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_ctl.c502
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_est.c20
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_ftp.c27
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_lblc.c3
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_lblcr.c3
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_nfct.c5
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_ovf.c86
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_pe_sip.c2
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_proto.c33
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c32
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c58
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c61
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_proto_udp.c49
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_sched.c14
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_sh.c45
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_sync.c374
-rw-r--r--kernel/net/netfilter/ipvs/ip_vs_xmit.c145
-rw-r--r--kernel/net/netfilter/nf_conntrack_core.c197
-rw-r--r--kernel/net/netfilter/nf_conntrack_expect.c22
-rw-r--r--kernel/net/netfilter/nf_conntrack_h323_main.c4
-rw-r--r--kernel/net/netfilter/nf_conntrack_labels.c34
-rw-r--r--kernel/net/netfilter/nf_conntrack_netlink.c331
-rw-r--r--kernel/net/netfilter/nf_conntrack_pptp.c3
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_dccp.c2
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_generic.c10
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_gre.c3
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_sctp.c103
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_tcp.c2
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_udp.c1
-rw-r--r--kernel/net/netfilter/nf_conntrack_proto_udplite.c1
-rw-r--r--kernel/net/netfilter/nf_conntrack_seqadj.c9
-rw-r--r--kernel/net/netfilter/nf_conntrack_standalone.c39
-rw-r--r--kernel/net/netfilter/nf_internals.h1
-rw-r--r--kernel/net/netfilter/nf_log.c9
-rw-r--r--kernel/net/netfilter/nf_nat_core.c28
-rw-r--r--kernel/net/netfilter/nf_nat_proto_dccp.c2
-rw-r--r--kernel/net/netfilter/nf_nat_proto_tcp.c2
-rw-r--r--kernel/net/netfilter/nf_nat_proto_udp.c2
-rw-r--r--kernel/net/netfilter/nf_nat_proto_udplite.c2
-rw-r--r--kernel/net/netfilter/nf_nat_redirect.c2
-rw-r--r--kernel/net/netfilter/nf_queue.c55
-rw-r--r--kernel/net/netfilter/nf_synproxy_core.c24
-rw-r--r--kernel/net/netfilter/nf_tables_api.c221
-rw-r--r--kernel/net/netfilter/nf_tables_core.c10
-rw-r--r--kernel/net/netfilter/nf_tables_netdev.c256
-rw-r--r--kernel/net/netfilter/nfnetlink.c54
-rw-r--r--kernel/net/netfilter/nfnetlink_acct.c71
-rw-r--r--kernel/net/netfilter/nfnetlink_cttimeout.c34
-rw-r--r--kernel/net/netfilter/nfnetlink_log.c91
-rw-r--r--kernel/net/netfilter/nfnetlink_queue.c (renamed from kernel/net/netfilter/nfnetlink_queue_core.c)144
-rw-r--r--kernel/net/netfilter/nfnetlink_queue_ct.c113
-rw-r--r--kernel/net/netfilter/nft_compat.c26
-rw-r--r--kernel/net/netfilter/nft_counter.c124
-rw-r--r--kernel/net/netfilter/nft_ct.c1
-rw-r--r--kernel/net/netfilter/nft_dynset.c5
-rw-r--r--kernel/net/netfilter/nft_limit.c188
-rw-r--r--kernel/net/netfilter/nft_log.c3
-rw-r--r--kernel/net/netfilter/nft_meta.c44
-rw-r--r--kernel/net/netfilter/nft_payload.c57
-rw-r--r--kernel/net/netfilter/nft_queue.c2
-rw-r--r--kernel/net/netfilter/nft_reject_inet.c19
-rw-r--r--kernel/net/netfilter/x_tables.c85
-rw-r--r--kernel/net/netfilter/xt_CT.c46
-rw-r--r--kernel/net/netfilter/xt_IDLETIMER.c1
-rw-r--r--kernel/net/netfilter/xt_LOG.c2
-rw-r--r--kernel/net/netfilter/xt_NFLOG.c2
-rw-r--r--kernel/net/netfilter/xt_TCPMSS.c16
-rw-r--r--kernel/net/netfilter/xt_TCPOPTSTRIP.c2
-rw-r--r--kernel/net/netfilter/xt_TEE.c168
-rw-r--r--kernel/net/netfilter/xt_TPROXY.c30
-rw-r--r--kernel/net/netfilter/xt_addrtype.c6
-rw-r--r--kernel/net/netfilter/xt_connlabel.c16
-rw-r--r--kernel/net/netfilter/xt_connlimit.c13
-rw-r--r--kernel/net/netfilter/xt_ipvs.c5
-rw-r--r--kernel/net/netfilter/xt_mark.c1
-rw-r--r--kernel/net/netfilter/xt_nfacct.c2
-rw-r--r--kernel/net/netfilter/xt_osf.c2
-rw-r--r--kernel/net/netfilter/xt_owner.c6
-rw-r--r--kernel/net/netfilter/xt_recent.c2
-rw-r--r--kernel/net/netfilter/xt_set.c47
-rw-r--r--kernel/net/netfilter/xt_socket.c73
109 files changed, 4975 insertions, 3487 deletions
diff --git a/kernel/net/netfilter/Kconfig b/kernel/net/netfilter/Kconfig
index a0f3e6a3c..4692782b5 100644
--- a/kernel/net/netfilter/Kconfig
+++ b/kernel/net/netfilter/Kconfig
@@ -1,6 +1,14 @@
menu "Core Netfilter Configuration"
depends on NET && INET && NETFILTER
+config NETFILTER_INGRESS
+ bool "Netfilter ingress support"
+ default y
+ select NET_INGRESS
+ help
+ This allows you to classify packets from ingress using the Netfilter
+ infrastructure.
+
config NETFILTER_NETLINK
tristate
@@ -198,7 +206,7 @@ config NF_CONNTRACK_FTP
config NF_CONNTRACK_H323
tristate "H.323 protocol support"
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
depends on NETFILTER_ADVANCED
help
H.323 is a VoIP signalling protocol from ITU-T. As one of the most
@@ -346,7 +354,7 @@ config NF_CT_NETLINK_HELPER
select NETFILTER_NETLINK
depends on NF_CT_NETLINK
depends on NETFILTER_NETLINK_QUEUE
- depends on NETFILTER_NETLINK_QUEUE_CT
+ depends on NETFILTER_NETLINK_GLUE_CT
depends on NETFILTER_ADVANCED
help
This option enables the user-space connection tracking helpers
@@ -354,13 +362,14 @@ config NF_CT_NETLINK_HELPER
If unsure, say `N'.
-config NETFILTER_NETLINK_QUEUE_CT
- bool "NFQUEUE integration with Connection Tracking"
- default n
- depends on NETFILTER_NETLINK_QUEUE
+config NETFILTER_NETLINK_GLUE_CT
+ bool "NFQUEUE and NFLOG integration with Connection Tracking"
+ default n
+ depends on (NETFILTER_NETLINK_QUEUE || NETFILTER_NETLINK_LOG) && NF_CT_NETLINK
help
- If this option is enabled, NFQUEUE can include Connection Tracking
- information together with the packet is the enqueued via NFNETLINK.
+ If this option is enabled, NFQUEUE and NFLOG can include
+ Connection Tracking information together with the packet is
+ the enqueued via NFNETLINK.
config NF_NAT
tristate
@@ -448,6 +457,11 @@ config NF_TABLES_INET
help
This option enables support for a mixed IPv4/IPv6 "inet" table.
+config NF_TABLES_NETDEV
+ tristate "Netfilter nf_tables netdev tables support"
+ help
+ This option enables support for the "netdev" table.
+
config NFT_EXTHDR
tristate "Netfilter nf_tables IPv6 exthdr module"
help
@@ -710,7 +724,7 @@ config NETFILTER_XT_TARGET_HL
config NETFILTER_XT_TARGET_HMARK
tristate '"HMARK" target support'
- depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on NETFILTER_ADVANCED
---help---
This option adds the "HMARK" target.
@@ -852,8 +866,10 @@ config NETFILTER_XT_TARGET_REDIRECT
config NETFILTER_XT_TARGET_TEE
tristate '"TEE" - packet cloning to alternate destination'
depends on NETFILTER_ADVANCED
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
depends on !NF_CONNTRACK || NF_CONNTRACK
+ select NF_DUP_IPV4
+ select NF_DUP_IPV6 if IP6_NF_IPTABLES != n
---help---
This option adds a "TEE" target with which a packet can be cloned and
this clone be rerouted to another nexthop.
@@ -862,11 +878,11 @@ config NETFILTER_XT_TARGET_TPROXY
tristate '"TPROXY" target transparent proxying support'
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
- depends on (IPV6 || IPV6=n)
- depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IPV6 || IPV6=n
+ depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on IP_NF_MANGLE
select NF_DEFRAG_IPV4
- select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
help
This option adds a `TPROXY' target, which is somewhat similar to
REDIRECT. It can only be used in the mangle table and is useful
@@ -902,7 +918,7 @@ config NETFILTER_XT_TARGET_SECMARK
config NETFILTER_XT_TARGET_TCPMSS
tristate '"TCPMSS" target support'
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
default m if NETFILTER_ADVANCED=n
---help---
This option adds a `TCPMSS' target, which allows you to alter the
@@ -1114,7 +1130,7 @@ config NETFILTER_XT_MATCH_ESP
config NETFILTER_XT_MATCH_HASHLIMIT
tristate '"hashlimit" match support'
- depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on NETFILTER_ADVANCED
help
This option adds a `hashlimit' match.
@@ -1356,10 +1372,10 @@ config NETFILTER_XT_MATCH_SOCKET
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
depends on !NF_CONNTRACK || NF_CONNTRACK
- depends on (IPV6 || IPV6=n)
- depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IPV6 || IPV6=n
+ depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
select NF_DEFRAG_IPV4
- select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
help
This option adds a `socket' match, which can be used to match
packets for which a TCP or UDP socket lookup finds a valid socket.
diff --git a/kernel/net/netfilter/Makefile b/kernel/net/netfilter/Makefile
index a87d8b8ec..7638c36b4 100644
--- a/kernel/net/netfilter/Makefile
+++ b/kernel/net/netfilter/Makefile
@@ -10,8 +10,6 @@ obj-$(CONFIG_NETFILTER) = netfilter.o
obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
-nfnetlink_queue-y := nfnetlink_queue_core.o
-nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o
obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
@@ -75,6 +73,7 @@ nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o
+obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
obj-$(CONFIG_NFT_META) += nft_meta.o
diff --git a/kernel/net/netfilter/core.c b/kernel/net/netfilter/core.c
index f0adf700b..10880c89d 100644
--- a/kernel/net/netfilter/core.c
+++ b/kernel/net/netfilter/core.c
@@ -40,6 +40,9 @@ EXPORT_SYMBOL(nf_afinfo);
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);
+DEFINE_PER_CPU(bool, nf_skb_duplicated);
+EXPORT_SYMBOL_GPL(nf_skb_duplicated);
+
int nf_register_afinfo(const struct nf_afinfo *afinfo)
{
mutex_lock(&afinfo_mutex);
@@ -58,9 +61,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
}
EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
-struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
-EXPORT_SYMBOL(nf_hooks);
-
#ifdef HAVE_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
@@ -68,33 +68,168 @@ EXPORT_SYMBOL(nf_hooks_needed);
static DEFINE_MUTEX(nf_hook_mutex);
-int nf_register_hook(struct nf_hook_ops *reg)
+static struct list_head *nf_find_hook_list(struct net *net,
+ const struct nf_hook_ops *reg)
+{
+ struct list_head *hook_list = NULL;
+
+ if (reg->pf != NFPROTO_NETDEV)
+ hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
+ else if (reg->hooknum == NF_NETDEV_INGRESS) {
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->dev && dev_net(reg->dev) == net)
+ hook_list = &reg->dev->nf_hooks_ingress;
+#endif
+ }
+ return hook_list;
+}
+
+struct nf_hook_entry {
+ const struct nf_hook_ops *orig_ops;
+ struct nf_hook_ops ops;
+};
+
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
+ struct list_head *hook_list;
+ struct nf_hook_entry *entry;
struct nf_hook_ops *elem;
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->orig_ops = reg;
+ entry->ops = *reg;
+
+ hook_list = nf_find_hook_list(net, reg);
+ if (!hook_list) {
+ kfree(entry);
+ return -ENOENT;
+ }
+
mutex_lock(&nf_hook_mutex);
- list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
+ list_for_each_entry(elem, hook_list, list) {
if (reg->priority < elem->priority)
break;
}
- list_add_rcu(&reg->list, elem->list.prev);
+ list_add_rcu(&entry->ops.list, elem->list.prev);
mutex_unlock(&nf_hook_mutex);
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ net_inc_ingress_queue();
+#endif
#ifdef HAVE_JUMP_LABEL
static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
return 0;
}
-EXPORT_SYMBOL(nf_register_hook);
+EXPORT_SYMBOL(nf_register_net_hook);
-void nf_unregister_hook(struct nf_hook_ops *reg)
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
+ struct list_head *hook_list;
+ struct nf_hook_entry *entry;
+ struct nf_hook_ops *elem;
+
+ hook_list = nf_find_hook_list(net, reg);
+ if (!hook_list)
+ return;
+
mutex_lock(&nf_hook_mutex);
- list_del_rcu(&reg->list);
+ list_for_each_entry(elem, hook_list, list) {
+ entry = container_of(elem, struct nf_hook_entry, ops);
+ if (entry->orig_ops == reg) {
+ list_del_rcu(&entry->ops.list);
+ break;
+ }
+ }
mutex_unlock(&nf_hook_mutex);
+ if (&elem->list == hook_list) {
+ WARN(1, "nf_unregister_net_hook: hook not found!\n");
+ return;
+ }
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ net_dec_ingress_queue();
+#endif
#ifdef HAVE_JUMP_LABEL
static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
synchronize_net();
+ nf_queue_nf_hook_drop(net, &entry->ops);
+ /* other cpu might still process nfqueue verdict that used reg */
+ synchronize_net();
+ kfree(entry);
+}
+EXPORT_SYMBOL(nf_unregister_net_hook);
+
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+ unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
+
+ for (i = 0; i < n; i++) {
+ err = nf_register_net_hook(net, &reg[i]);
+ if (err)
+ goto err;
+ }
+ return err;
+
+err:
+ if (i > 0)
+ nf_unregister_net_hooks(net, reg, i);
+ return err;
+}
+EXPORT_SYMBOL(nf_register_net_hooks);
+
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+ unsigned int n)
+{
+ while (n-- > 0)
+ nf_unregister_net_hook(net, &reg[n]);
+}
+EXPORT_SYMBOL(nf_unregister_net_hooks);
+
+static LIST_HEAD(nf_hook_list);
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+ struct net *net, *last;
+ int ret;
+
+ rtnl_lock();
+ for_each_net(net) {
+ ret = nf_register_net_hook(net, reg);
+ if (ret && ret != -ENOENT)
+ goto rollback;
+ }
+ list_add_tail(&reg->list, &nf_hook_list);
+ rtnl_unlock();
+
+ return 0;
+rollback:
+ last = net;
+ for_each_net(net) {
+ if (net == last)
+ break;
+ nf_unregister_net_hook(net, reg);
+ }
+ rtnl_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_del(&reg->list);
+ for_each_net(net)
+ nf_unregister_net_hook(net, reg);
+ rtnl_unlock();
}
EXPORT_SYMBOL(nf_unregister_hook);
@@ -142,7 +277,7 @@ unsigned int nf_iterate(struct list_head *head,
/* Optimization: we don't need to hold module
reference here, since function can't sleep. --RR */
repeat:
- verdict = (*elemp)->hook(*elemp, skb, state);
+ verdict = (*elemp)->hook((*elemp)->priv, skb, state);
if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
if (unlikely((verdict & NF_VERDICT_MASK)
@@ -172,11 +307,9 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
/* We may already have this, but read-locks nest anyway */
rcu_read_lock();
- elem = list_entry_rcu(&nf_hooks[state->pf][state->hook],
- struct nf_hook_ops, list);
+ elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list);
next_hook:
- verdict = nf_iterate(&nf_hooks[state->pf][state->hook], skb, state,
- &elem);
+ verdict = nf_iterate(state->hook_list, skb, state, &elem);
if (verdict == NF_ACCEPT || verdict == NF_STOP) {
ret = 1;
} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
@@ -188,8 +321,6 @@ next_hook:
int err = nf_queue(skb, elem, state,
verdict >> NF_VERDICT_QBITS);
if (err < 0) {
- if (err == -ECANCELED)
- goto next_hook;
if (err == -ESRCH &&
(verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
goto next_hook;
@@ -223,6 +354,12 @@ int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
}
EXPORT_SYMBOL(skb_make_writable);
+/* This needs to be compiled in any case to avoid dependencies between the
+ * nfnetlink_queue code and nf_conntrack.
+ */
+struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfnl_ct_hook);
+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* This does not belong here, but locally generated errors need it if connection
tracking in use: without this, connection may not be in hash table, and hence
@@ -260,12 +397,12 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
}
EXPORT_SYMBOL(nf_conntrack_destroy);
-struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly;
-EXPORT_SYMBOL_GPL(nfq_ct_hook);
-
-struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly;
-EXPORT_SYMBOL_GPL(nfq_ct_nat_hook);
-
+/* Built-in default zone used e.g. by modules. */
+const struct nf_conntrack_zone nf_ct_zone_dflt = {
+ .id = NF_CT_DEFAULT_ZONE_ID,
+ .dir = NF_CT_DEFAULT_ZONE_DIR,
+};
+EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
#endif /* CONFIG_NF_CONNTRACK */
#ifdef CONFIG_NF_NAT_NEEDED
@@ -273,8 +410,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
EXPORT_SYMBOL(nf_nat_decode_session_hook);
#endif
+static int nf_register_hook_list(struct net *net)
+{
+ struct nf_hook_ops *elem;
+ int ret;
+
+ rtnl_lock();
+ list_for_each_entry(elem, &nf_hook_list, list) {
+ ret = nf_register_net_hook(net, elem);
+ if (ret && ret != -ENOENT)
+ goto out_undo;
+ }
+ rtnl_unlock();
+ return 0;
+
+out_undo:
+ list_for_each_entry_continue_reverse(elem, &nf_hook_list, list)
+ nf_unregister_net_hook(net, elem);
+ rtnl_unlock();
+ return ret;
+}
+
+static void nf_unregister_hook_list(struct net *net)
+{
+ struct nf_hook_ops *elem;
+
+ rtnl_lock();
+ list_for_each_entry(elem, &nf_hook_list, list)
+ nf_unregister_net_hook(net, elem);
+ rtnl_unlock();
+}
+
static int __net_init netfilter_net_init(struct net *net)
{
+ int i, h, ret;
+
+ for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
+ for (h = 0; h < NF_MAX_HOOKS; h++)
+ INIT_LIST_HEAD(&net->nf.hooks[i][h]);
+ }
+
#ifdef CONFIG_PROC_FS
net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
net->proc_net);
@@ -285,11 +460,16 @@ static int __net_init netfilter_net_init(struct net *net)
return -ENOMEM;
}
#endif
- return 0;
+ ret = nf_register_hook_list(net);
+ if (ret)
+ remove_proc_entry("netfilter", net->proc_net);
+
+ return ret;
}
static void __net_exit netfilter_net_exit(struct net *net)
{
+ nf_unregister_hook_list(net);
remove_proc_entry("netfilter", net->proc_net);
}
@@ -300,12 +480,7 @@ static struct pernet_operations netfilter_net_ops = {
int __init netfilter_init(void)
{
- int i, h, ret;
-
- for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
- for (h = 0; h < NF_MAX_HOOKS; h++)
- INIT_LIST_HEAD(&nf_hooks[i][h]);
- }
+ int ret;
ret = register_pernet_subsys(&netfilter_net_ops);
if (ret < 0)
diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h b/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h
index 6f024a8a1..b0bc475f6 100644
--- a/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -33,7 +33,7 @@
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype MTYPE
-#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id))
+#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
static void
mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
@@ -41,7 +41,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
struct mtype *map = set->data;
init_timer(&map->gc);
- map->gc.data = (unsigned long) set;
+ map->gc.data = (unsigned long)set;
map->gc.function = gc;
map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&map->gc);
@@ -67,12 +67,9 @@ mtype_destroy(struct ip_set *set)
del_timer_sync(&map->gc);
ip_set_free(map->members);
- if (set->dsize) {
- if (set->extensions & IPSET_EXT_DESTROY)
- mtype_ext_cleanup(set);
- ip_set_free(map->extensions);
- }
- kfree(map);
+ if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set);
+ ip_set_free(map);
set->data = NULL;
}
@@ -92,16 +89,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
{
const struct mtype *map = set->data;
struct nlattr *nested;
+ size_t memsize = sizeof(*map) + map->memsize;
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (mtype_do_head(skb, map) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
- htonl(sizeof(*map) +
- map->memsize +
- set->dsize * map->elements)))
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -144,10 +139,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (ret == IPSET_ADD_FAILED) {
if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(x, set)))
+ ip_set_timeout_expired(ext_timeout(x, set))) {
ret = 0;
- else if (!(flags & IPSET_FLAG_EXIST))
+ } else if (!(flags & IPSET_FLAG_EXIST)) {
+ set_bit(e->id, map->members);
return -IPSET_ERR_EXIST;
+ }
/* Element is re-added, cleanup extensions */
ip_set_ext_destroy(set, x);
}
@@ -165,6 +162,10 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
ip_set_init_comment(ext_comment(x, set), ext);
if (SET_WITH_SKBINFO(set))
ip_set_init_skbinfo(ext_skbinfo(x, set), ext);
+
+ /* Activate element */
+ set_bit(e->id, map->members);
+
return 0;
}
@@ -203,10 +204,13 @@ mtype_list(const struct ip_set *set,
struct nlattr *adt, *nested;
void *x;
u32 id, first = cb->args[IPSET_CB_ARG0];
+ int ret = 0;
adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!adt)
return -EMSGSIZE;
+ /* Extensions may be replaced */
+ rcu_read_lock();
for (; cb->args[IPSET_CB_ARG0] < map->elements;
cb->args[IPSET_CB_ARG0]++) {
id = cb->args[IPSET_CB_ARG0];
@@ -214,7 +218,7 @@ mtype_list(const struct ip_set *set,
if (!test_bit(id, map->members) ||
(SET_WITH_TIMEOUT(set) &&
#ifdef IP_SET_BITMAP_STORED_TIMEOUT
- mtype_is_filled((const struct mtype_elem *) x) &&
+ mtype_is_filled((const struct mtype_elem *)x) &&
#endif
ip_set_timeout_expired(ext_timeout(x, set))))
continue;
@@ -222,14 +226,16 @@ mtype_list(const struct ip_set *set,
if (!nested) {
if (id == first) {
nla_nest_cancel(skb, adt);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
+ goto nla_put_failure;
}
if (mtype_do_list(skb, map, id, set->dsize))
goto nla_put_failure;
if (ip_set_put_extensions(skb, set, x,
- mtype_is_filled((const struct mtype_elem *) x)))
+ mtype_is_filled((const struct mtype_elem *)x)))
goto nla_put_failure;
ipset_nest_end(skb, nested);
}
@@ -238,29 +244,32 @@ mtype_list(const struct ip_set *set,
/* Set listing finished */
cb->args[IPSET_CB_ARG0] = 0;
- return 0;
+ goto out;
nla_put_failure:
nla_nest_cancel(skb, nested);
if (unlikely(id == first)) {
cb->args[IPSET_CB_ARG0] = 0;
- return -EMSGSIZE;
+ ret = -EMSGSIZE;
}
ipset_nest_end(skb, adt);
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static void
mtype_gc(unsigned long ul_set)
{
- struct ip_set *set = (struct ip_set *) ul_set;
+ struct ip_set *set = (struct ip_set *)ul_set;
struct mtype *map = set->data;
void *x;
u32 id;
/* We run parallel with other readers (test element)
- * but adding/deleting new entries is locked out */
- read_lock_bh(&set->lock);
+ * but adding/deleting new entries is locked out
+ */
+ spin_lock_bh(&set->lock);
for (id = 0; id < map->elements; id++)
if (mtype_gc_test(id, map, set->dsize)) {
x = get_ext(set, map, id);
@@ -269,7 +278,7 @@ mtype_gc(unsigned long ul_set)
ip_set_ext_destroy(set, x);
}
}
- read_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&map->gc);
diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c b/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c
index 55b083ec5..4783efff0 100644
--- a/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -36,11 +36,11 @@ IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:ip");
#define MTYPE bitmap_ip
+#define HOST_MASK 32
/* Type structure */
struct bitmap_ip {
void *members; /* the set members */
- void *extensions; /* data extensions */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
@@ -48,6 +48,8 @@ struct bitmap_ip {
size_t memsize; /* members size */
u8 netmask; /* subnet netmask */
struct timer_list gc; /* garbage collection */
+ unsigned char extensions[0] /* data extensions */
+ __aligned(__alignof__(u64));
};
/* ADT structure for generic function args */
@@ -58,7 +60,7 @@ struct bitmap_ip_adt_elem {
static inline u32
ip_to_id(const struct bitmap_ip *m, u32 ip)
{
- return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts;
+ return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts;
}
/* Common functions */
@@ -80,7 +82,7 @@ static inline int
bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map,
u32 flags, size_t dsize)
{
- return !!test_and_set_bit(e->id, map->members);
+ return !!test_bit(e->id, map->members);
}
static inline int
@@ -137,20 +139,17 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret = 0;
- if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ if (unlikely(!tb[IPSET_ATTR_IP]))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -174,11 +173,12 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
- } else
+ } else {
ip_to = ip;
+ }
if (ip_to > map->last_ip)
return -IPSET_ERR_BITMAP_RANGE;
@@ -189,8 +189,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -225,13 +225,6 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
- if (set->dsize) {
- map->extensions = ip_set_alloc(set->dsize * elements);
- if (!map->extensions) {
- kfree(map->members);
- return false;
- }
- }
map->first_ip = first_ip;
map->last_ip = last_ip;
map->elements = elements;
@@ -277,16 +270,17 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (cidr >= 32)
+ if (cidr >= HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(first_ip, last_ip, cidr);
- } else
+ } else {
return -IPSET_ERR_PROTOCOL;
+ }
if (tb[IPSET_ATTR_NETMASK]) {
netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
- if (netmask > 32)
+ if (netmask > HOST_MASK)
return -IPSET_ERR_INVALID_NETMASK;
first_ip &= ip_set_hostmask(netmask);
@@ -316,13 +310,13 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
pr_debug("hosts %u, elements %llu\n",
hosts, (unsigned long long)elements);
- map = kzalloc(sizeof(*map), GFP_KERNEL);
+ set->dsize = ip_set_elem_len(set, tb, 0, 0);
+ map = ip_set_alloc(sizeof(*map) + elements * set->dsize);
if (!map)
return -ENOMEM;
map->memsize = bitmap_bytes(0, elements - 1);
set->variant = &bitmap_ip;
- set->dsize = ip_set_elem_len(set, tb, 0);
if (!init_map_ip(set, map, first_ip, last_ip,
elements, hosts, netmask)) {
kfree(map);
@@ -360,7 +354,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -377,6 +372,7 @@ bitmap_ip_init(void)
static void __exit
bitmap_ip_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&bitmap_ip_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 86104744b..29dde2083 100644
--- a/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -36,6 +36,7 @@ IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:ip,mac");
#define MTYPE bitmap_ipmac
+#define HOST_MASK 32
#define IP_SET_BITMAP_STORED_TIMEOUT
enum {
@@ -46,24 +47,26 @@ enum {
/* Type structure */
struct bitmap_ipmac {
void *members; /* the set members */
- void *extensions; /* MAC + data extensions */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
size_t memsize; /* members size */
struct timer_list gc; /* garbage collector */
+ unsigned char extensions[0] /* MAC + data extensions */
+ __aligned(__alignof__(u64));
};
/* ADT structure for generic function args */
struct bitmap_ipmac_adt_elem {
+ unsigned char ether[ETH_ALEN] __aligned(2);
u16 id;
- unsigned char *ether;
+ u16 add_mac;
};
struct bitmap_ipmac_elem {
unsigned char ether[ETH_ALEN];
unsigned char filled;
-} __attribute__ ((aligned));
+} __aligned(__alignof__(u64));
static inline u32
ip_to_id(const struct bitmap_ipmac *m, u32 ip)
@@ -71,11 +74,11 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip)
return ip - m->first_ip;
}
-static inline struct bitmap_ipmac_elem *
-get_elem(void *extensions, u16 id, size_t dsize)
-{
- return (struct bitmap_ipmac_elem *)(extensions + id * dsize);
-}
+#define get_elem(extensions, id, dsize) \
+ (struct bitmap_ipmac_elem *)(extensions + (id) * (dsize))
+
+#define get_const_elem(extensions, id, dsize) \
+ (const struct bitmap_ipmac_elem *)(extensions + (id) * (dsize))
/* Common functions */
@@ -87,10 +90,9 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
if (!test_bit(e->id, map->members))
return 0;
- elem = get_elem(map->extensions, e->id, dsize);
- if (elem->filled == MAC_FILLED)
- return e->ether == NULL ||
- ether_addr_equal(e->ether, elem->ether);
+ elem = get_const_elem(map->extensions, e->id, dsize);
+ if (e->add_mac && elem->filled == MAC_FILLED)
+ return ether_addr_equal(e->ether, elem->ether);
/* Trigger kernel to fill out the ethernet address */
return -EAGAIN;
}
@@ -102,7 +104,7 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)
if (!test_bit(id, map->members))
return 0;
- elem = get_elem(map->extensions, id, dsize);
+ elem = get_const_elem(map->extensions, id, dsize);
/* Timer not started for the incomplete elements */
return elem->filled == MAC_FILLED;
}
@@ -130,8 +132,9 @@ bitmap_ipmac_add_timeout(unsigned long *timeout,
/* If MAC is unset yet, we store plain timeout value
* because the timer is not activated yet
* and we can reuse it later when MAC is filled out,
- * possibly by the kernel */
- if (e->ether)
+ * possibly by the kernel
+ */
+ if (e->add_mac)
ip_set_timeout_set(timeout, t);
else
*timeout = t;
@@ -146,28 +149,35 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
struct bitmap_ipmac_elem *elem;
elem = get_elem(map->extensions, e->id, dsize);
- if (test_and_set_bit(e->id, map->members)) {
+ if (test_bit(e->id, map->members)) {
if (elem->filled == MAC_FILLED) {
- if (e->ether && (flags & IPSET_FLAG_EXIST))
- memcpy(elem->ether, e->ether, ETH_ALEN);
+ if (e->add_mac &&
+ (flags & IPSET_FLAG_EXIST) &&
+ !ether_addr_equal(e->ether, elem->ether)) {
+ /* memcpy isn't atomic */
+ clear_bit(e->id, map->members);
+ smp_mb__after_atomic();
+ ether_addr_copy(elem->ether, e->ether);
+ }
return IPSET_ADD_FAILED;
- } else if (!e->ether)
+ } else if (!e->add_mac)
/* Already added without ethernet address */
return IPSET_ADD_FAILED;
/* Fill the MAC address and trigger the timer activation */
- memcpy(elem->ether, e->ether, ETH_ALEN);
+ clear_bit(e->id, map->members);
+ smp_mb__after_atomic();
+ ether_addr_copy(elem->ether, e->ether);
elem->filled = MAC_FILLED;
return IPSET_ADD_START_STORED_TIMEOUT;
- } else if (e->ether) {
+ } else if (e->add_mac) {
/* We can store MAC too */
- memcpy(elem->ether, e->ether, ETH_ALEN);
+ ether_addr_copy(elem->ether, e->ether);
elem->filled = MAC_FILLED;
return 0;
- } else {
- elem->filled = MAC_UNSET;
- /* MAC is not stored yet, don't start timer */
- return IPSET_ADD_STORE_PLAIN_TIMEOUT;
}
+ elem->filled = MAC_UNSET;
+ /* MAC is not stored yet, don't start timer */
+ return IPSET_ADD_STORE_PLAIN_TIMEOUT;
}
static inline int
@@ -182,7 +192,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
u32 id, size_t dsize)
{
const struct bitmap_ipmac_elem *elem =
- get_elem(map->extensions, id, dsize);
+ get_const_elem(map->extensions, id, dsize);
return nla_put_ipaddr4(skb, IPSET_ATTR_IP,
htonl(map->first_ip + id)) ||
@@ -204,7 +214,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
{
struct bitmap_ipmac *map = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct bitmap_ipmac_adt_elem e = { .id = 0 };
+ struct bitmap_ipmac_adt_elem e = { .id = 0, .add_mac = 1 };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
u32 ip;
@@ -222,7 +232,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
return -EINVAL;
e.id = ip_to_id(map, ip);
- e.ether = eth_hdr(skb)->h_source;
+ memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -238,20 +248,17 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
u32 ip = 0;
int ret = 0;
- if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ if (unlikely(!tb[IPSET_ATTR_IP]))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -259,11 +266,10 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
return -IPSET_ERR_BITMAP_RANGE;
e.id = ip_to_id(map, ip);
- if (tb[IPSET_ATTR_ETHER])
- e.ether = nla_data(tb[IPSET_ATTR_ETHER]);
- else
- e.ether = NULL;
-
+ if (tb[IPSET_ATTR_ETHER]) {
+ memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+ e.add_mac = 1;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_eexist(ret, flags) ? 0 : ret;
@@ -294,13 +300,6 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
- if (set->dsize) {
- map->extensions = ip_set_alloc(set->dsize * elements);
- if (!map->extensions) {
- kfree(map->members);
- return false;
- }
- }
map->first_ip = first_ip;
map->last_ip = last_ip;
map->elements = elements;
@@ -343,25 +342,27 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (cidr >= 32)
+ if (cidr >= HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(first_ip, last_ip, cidr);
- } else
+ } else {
return -IPSET_ERR_PROTOCOL;
+ }
elements = (u64)last_ip - first_ip + 1;
if (elements > IPSET_BITMAP_MAX_RANGE + 1)
return -IPSET_ERR_BITMAP_RANGE_SIZE;
- map = kzalloc(sizeof(*map), GFP_KERNEL);
+ set->dsize = ip_set_elem_len(set, tb,
+ sizeof(struct bitmap_ipmac_elem),
+ __alignof__(struct bitmap_ipmac_elem));
+ map = ip_set_alloc(sizeof(*map) + elements * set->dsize);
if (!map)
return -ENOMEM;
map->memsize = bitmap_bytes(0, elements - 1);
set->variant = &bitmap_ipmac;
- set->dsize = ip_set_elem_len(set, tb,
- sizeof(struct bitmap_ipmac_elem));
if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
kfree(map);
return -ENOMEM;
@@ -397,7 +398,8 @@ static struct ip_set_type bitmap_ipmac_type = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -414,6 +416,7 @@ bitmap_ipmac_init(void)
static void __exit
bitmap_ipmac_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&bitmap_ipmac_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_port.c b/kernel/net/netfilter/ipset/ip_set_bitmap_port.c
index 005dd3644..7f0c73335 100644
--- a/kernel/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/kernel/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -35,12 +35,13 @@ MODULE_ALIAS("ip_set_bitmap:port");
/* Type structure */
struct bitmap_port {
void *members; /* the set members */
- void *extensions; /* data extensions */
u16 first_port; /* host byte order, included in range */
u16 last_port; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
size_t memsize; /* members size */
struct timer_list gc; /* garbage collection */
+ unsigned char extensions[0] /* data extensions */
+ __aligned(__alignof__(u64));
};
/* ADT structure for generic function args */
@@ -73,7 +74,7 @@ static inline int
bitmap_port_do_add(const struct bitmap_port_adt_elem *e,
struct bitmap_port *map, u32 flags, size_t dsize)
{
- return !!test_and_set_bit(e->id, map->members);
+ return !!test_bit(e->id, map->members);
}
static inline int
@@ -136,19 +137,13 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
u16 port_to;
int ret = 0;
- if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
+ return -IPSET_ERR_PROTOCOL;
+
port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
if (port < map->first_port || port > map->last_port)
return -IPSET_ERR_BITMAP_RANGE;
@@ -168,8 +163,9 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
if (port < map->first_port)
return -IPSET_ERR_BITMAP_RANGE;
}
- } else
+ } else {
port_to = port;
+ }
if (port_to > map->last_port)
return -IPSET_ERR_BITMAP_RANGE;
@@ -180,8 +176,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -214,13 +210,6 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
- if (set->dsize) {
- map->extensions = ip_set_alloc(set->dsize * map->elements);
- if (!map->extensions) {
- kfree(map->members);
- return false;
- }
- }
map->first_port = first_port;
map->last_port = last_port;
set->timeout = IPSET_NO_TIMEOUT;
@@ -237,6 +226,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
{
struct bitmap_port *map;
u16 first_port, last_port;
+ u32 elements;
if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
@@ -253,14 +243,15 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
last_port = tmp;
}
- map = kzalloc(sizeof(*map), GFP_KERNEL);
+ elements = last_port - first_port + 1;
+ set->dsize = ip_set_elem_len(set, tb, 0, 0);
+ map = ip_set_alloc(sizeof(*map) + elements * set->dsize);
if (!map)
return -ENOMEM;
- map->elements = last_port - first_port + 1;
+ map->elements = elements;
map->memsize = bitmap_bytes(0, map->elements);
set->variant = &bitmap_port;
- set->dsize = ip_set_elem_len(set, tb, 0);
if (!init_map_port(set, map, first_port, last_port)) {
kfree(map);
return -ENOMEM;
@@ -294,7 +285,8 @@ static struct ip_set_type bitmap_port_type = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -311,6 +303,7 @@ bitmap_port_init(void)
static void __exit
bitmap_port_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&bitmap_port_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_core.c b/kernel/net/netfilter/ipset/ip_set_core.c
index d259da3ce..54f3d7cb2 100644
--- a/kernel/net/netfilter/ipset/ip_set_core.c
+++ b/kernel/net/netfilter/ipset/ip_set_core.c
@@ -32,8 +32,10 @@ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */
struct ip_set_net {
struct ip_set * __rcu *ip_set_list; /* all individual sets */
ip_set_id_t ip_set_max; /* max number of sets */
- int is_deleted; /* deleted by ip_set_net_exit */
+ bool is_deleted; /* deleted by ip_set_net_exit */
+ bool is_destroyed; /* all sets are destroyed */
};
+
static int ip_set_net_id __read_mostly;
static inline struct ip_set_net *ip_set_pernet(struct net *net)
@@ -42,7 +44,7 @@ static inline struct ip_set_net *ip_set_pernet(struct net *net)
}
#define IP_SET_INC 64
-#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0)
+#define STRNCMP(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0)
static unsigned int max_sets;
@@ -59,8 +61,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
#define ip_set(inst, id) \
ip_set_dereference((inst)->ip_set_list)[id]
-/*
- * The set types are implemented in modules and registered set types
+/* The set types are implemented in modules and registered set types
* can be found in ip_set_type_list. Adding/deleting types is
* serialized by ip_set_type_mutex.
*/
@@ -85,7 +86,7 @@ find_set_type(const char *name, u8 family, u8 revision)
struct ip_set_type *type;
list_for_each_entry_rcu(type, &ip_set_type_list, list)
- if (STREQ(type->name, name) &&
+ if (STRNCMP(type->name, name) &&
(type->family == family ||
type->family == NFPROTO_UNSPEC) &&
revision >= type->revision_min &&
@@ -130,9 +131,10 @@ __find_set_type_get(const char *name, u8 family, u8 revision,
goto unlock;
}
/* Make sure the type is already loaded
- * but we don't support the revision */
+ * but we don't support the revision
+ */
list_for_each_entry_rcu(type, &ip_set_type_list, list)
- if (STREQ(type->name, name)) {
+ if (STRNCMP(type->name, name)) {
err = -IPSET_ERR_FIND_TYPE;
goto unlock;
}
@@ -166,7 +168,7 @@ __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max,
*min = 255; *max = 0;
rcu_read_lock();
list_for_each_entry_rcu(type, &ip_set_type_list, list)
- if (STREQ(type->name, name) &&
+ if (STRNCMP(type->name, name) &&
(type->family == family ||
type->family == NFPROTO_UNSPEC)) {
found = true;
@@ -208,15 +210,15 @@ ip_set_type_register(struct ip_set_type *type)
pr_warn("ip_set type %s, family %s with revision min %u already registered!\n",
type->name, family_name(type->family),
type->revision_min);
- ret = -EINVAL;
- goto unlock;
+ ip_set_type_unlock();
+ return -EINVAL;
}
list_add_rcu(&type->list, &ip_set_type_list);
pr_debug("type %s, family %s, revision %u:%u registered.\n",
type->name, family_name(type->family),
type->revision_min, type->revision_max);
-unlock:
ip_set_type_unlock();
+
return ret;
}
EXPORT_SYMBOL_GPL(ip_set_type_register);
@@ -230,12 +232,12 @@ ip_set_type_unregister(struct ip_set_type *type)
pr_warn("ip_set type %s, family %s with revision min %u not registered\n",
type->name, family_name(type->family),
type->revision_min);
- goto unlock;
+ ip_set_type_unlock();
+ return;
}
list_del_rcu(&type->list);
pr_debug("type %s, family %s with revision min %u unregistered.\n",
type->name, family_name(type->family), type->revision_min);
-unlock:
ip_set_type_unlock();
synchronize_rcu();
@@ -289,7 +291,7 @@ static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
int
ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr)
{
- struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+ struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1];
if (unlikely(!flag_nested(nla)))
return -IPSET_ERR_PROTOCOL;
@@ -306,7 +308,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
int
ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
{
- struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+ struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1];
if (unlikely(!flag_nested(nla)))
return -IPSET_ERR_PROTOCOL;
@@ -317,7 +319,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
return -IPSET_ERR_PROTOCOL;
memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]),
- sizeof(struct in6_addr));
+ sizeof(struct in6_addr));
return 0;
}
EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
@@ -362,25 +364,27 @@ add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[])
}
size_t
-ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len)
+ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len,
+ size_t align)
{
enum ip_set_ext_id id;
- size_t offset = 0;
u32 cadt_flags = 0;
if (tb[IPSET_ATTR_CADT_FLAGS])
cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_WITH_FORCEADD)
set->flags |= IPSET_CREATE_FLAG_FORCEADD;
+ if (!align)
+ align = 1;
for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
if (!add_extension(id, cadt_flags, tb))
continue;
- offset += ALIGN(len + offset, ip_set_extensions[id].align);
- set->offset[id] = offset;
+ len = ALIGN(len, ip_set_extensions[id].align);
+ set->offset[id] = len;
set->extensions |= ip_set_extensions[id].type;
- offset += ip_set_extensions[id].len;
+ len += ip_set_extensions[id].len;
}
- return len + offset;
+ return ALIGN(len, align);
}
EXPORT_SYMBOL_GPL(ip_set_elem_len);
@@ -389,13 +393,22 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext *ext)
{
u64 fullmark;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!(set->extensions & IPSET_EXT_TIMEOUT))
+ if (!SET_WITH_TIMEOUT(set))
return -IPSET_ERR_TIMEOUT;
ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
}
if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) {
- if (!(set->extensions & IPSET_EXT_COUNTER))
+ if (!SET_WITH_COUNTER(set))
return -IPSET_ERR_COUNTER;
if (tb[IPSET_ATTR_BYTES])
ext->bytes = be64_to_cpu(nla_get_be64(
@@ -405,25 +418,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
tb[IPSET_ATTR_PACKETS]));
}
if (tb[IPSET_ATTR_COMMENT]) {
- if (!(set->extensions & IPSET_EXT_COMMENT))
+ if (!SET_WITH_COMMENT(set))
return -IPSET_ERR_COMMENT;
ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]);
}
if (tb[IPSET_ATTR_SKBMARK]) {
- if (!(set->extensions & IPSET_EXT_SKBINFO))
+ if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK]));
ext->skbmark = fullmark >> 32;
ext->skbmarkmask = fullmark & 0xffffffff;
}
if (tb[IPSET_ATTR_SKBPRIO]) {
- if (!(set->extensions & IPSET_EXT_SKBINFO))
+ if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
ext->skbprio = be32_to_cpu(nla_get_be32(
tb[IPSET_ATTR_SKBPRIO]));
}
if (tb[IPSET_ATTR_SKBQUEUE]) {
- if (!(set->extensions & IPSET_EXT_SKBINFO))
+ if (!SET_WITH_SKBINFO(set))
return -IPSET_ERR_SKBINFO;
ext->skbqueue = be16_to_cpu(nla_get_be16(
tb[IPSET_ATTR_SKBQUEUE]));
@@ -432,8 +445,32 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
}
EXPORT_SYMBOL_GPL(ip_set_get_extensions);
-/*
- * Creating/destroying/renaming/swapping affect the existence and
+int
+ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
+ const void *e, bool active)
+{
+ if (SET_WITH_TIMEOUT(set)) {
+ unsigned long *timeout = ext_timeout(e, set);
+
+ if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(active ? ip_set_timeout_get(timeout)
+ : *timeout)))
+ return -EMSGSIZE;
+ }
+ if (SET_WITH_COUNTER(set) &&
+ ip_set_put_counter(skb, ext_counter(e, set)))
+ return -EMSGSIZE;
+ if (SET_WITH_COMMENT(set) &&
+ ip_set_put_comment(skb, ext_comment(e, set)))
+ return -EMSGSIZE;
+ if (SET_WITH_SKBINFO(set) &&
+ ip_set_put_skbinfo(skb, ext_skbinfo(e, set)))
+ return -EMSGSIZE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_put_extensions);
+
+/* Creating/destroying/renaming/swapping affect the existence and
* the properties of a set. All of these can be executed from userspace
* only and serialized by the nfnl mutex indirectly from nfnetlink.
*
@@ -460,8 +497,7 @@ __ip_set_put(struct ip_set *set)
write_unlock_bh(&ip_set_ref_lock);
}
-/*
- * Add, del and test set entries from kernel.
+/* Add, del and test set entries from kernel.
*
* The set behind the index must exist and must be referenced
* so it can't be destroyed (or changed) under our foot.
@@ -485,27 +521,26 @@ int
ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(
- dev_net(par->in ? par->in : par->out), index);
+ struct ip_set *set = ip_set_rcu_get(par->net, index);
int ret = 0;
- BUG_ON(set == NULL);
+ BUG_ON(!set);
pr_debug("set %s, index %u\n", set->name, index);
if (opt->dim < set->type->dimension ||
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return 0;
- read_lock_bh(&set->lock);
+ rcu_read_lock_bh();
ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
- read_unlock_bh(&set->lock);
+ rcu_read_unlock_bh();
if (ret == -EAGAIN) {
/* Type requests element to be completed */
pr_debug("element must be completed, ADD is triggered\n");
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
set->variant->kadt(set, skb, par, IPSET_ADD, opt);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
ret = 1;
} else {
/* --return-nomatch: invert matched element */
@@ -524,20 +559,19 @@ int
ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(
- dev_net(par->in ? par->in : par->out), index);
+ struct ip_set *set = ip_set_rcu_get(par->net, index);
int ret;
- BUG_ON(set == NULL);
+ BUG_ON(!set);
pr_debug("set %s, index %u\n", set->name, index);
if (opt->dim < set->type->dimension ||
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return -IPSET_ERR_TYPE_MISMATCH;
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
return ret;
}
@@ -547,27 +581,25 @@ int
ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
- struct ip_set *set = ip_set_rcu_get(
- dev_net(par->in ? par->in : par->out), index);
+ struct ip_set *set = ip_set_rcu_get(par->net, index);
int ret = 0;
- BUG_ON(set == NULL);
+ BUG_ON(!set);
pr_debug("set %s, index %u\n", set->name, index);
if (opt->dim < set->type->dimension ||
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return -IPSET_ERR_TYPE_MISMATCH;
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
return ret;
}
EXPORT_SYMBOL_GPL(ip_set_del);
-/*
- * Find set by name, reference it once. The reference makes sure the
+/* Find set by name, reference it once. The reference makes sure the
* thing pointed to, does not go away under our feet.
*
*/
@@ -581,7 +613,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
rcu_read_lock();
for (i = 0; i < inst->ip_set_max; i++) {
s = rcu_dereference(inst->ip_set_list)[i];
- if (s != NULL && STREQ(s->name, name)) {
+ if (s && STRNCMP(s->name, name)) {
__ip_set_get(s);
index = i;
*set = s;
@@ -594,8 +626,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
}
EXPORT_SYMBOL_GPL(ip_set_get_byname);
-/*
- * If the given set pointer points to a valid set, decrement
+/* If the given set pointer points to a valid set, decrement
* reference count by 1. The caller shall not assume the index
* to be valid, after calling this function.
*
@@ -608,7 +639,7 @@ __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
rcu_read_lock();
set = rcu_dereference(inst->ip_set_list)[index];
- if (set != NULL)
+ if (set)
__ip_set_put(set);
rcu_read_unlock();
}
@@ -622,8 +653,7 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index)
}
EXPORT_SYMBOL_GPL(ip_set_put_byindex);
-/*
- * Get the name of a set behind a set index.
+/* Get the name of a set behind a set index.
* We assume the set is referenced, so it does exist and
* can't be destroyed. The set cannot be renamed due to
* the referencing either.
@@ -634,7 +664,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index)
{
const struct ip_set *set = ip_set_rcu_get(net, index);
- BUG_ON(set == NULL);
+ BUG_ON(!set);
BUG_ON(set->ref == 0);
/* Referenced, so it's safe */
@@ -642,13 +672,11 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index)
}
EXPORT_SYMBOL_GPL(ip_set_name_byindex);
-/*
- * Routines to call by external subsystems, which do not
+/* Routines to call by external subsystems, which do not
* call nfnl_lock for us.
*/
-/*
- * Find set by index, reference it once. The reference makes sure the
+/* Find set by index, reference it once. The reference makes sure the
* thing pointed to, does not go away under our feet.
*
* The nfnl mutex is used in the function.
@@ -674,8 +702,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
}
EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
-/*
- * If the given set pointer points to a valid set, decrement
+/* If the given set pointer points to a valid set, decrement
* reference count by 1. The caller shall not assume the index
* to be valid, after calling this function.
*
@@ -690,15 +717,14 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index)
nfnl_lock(NFNL_SUBSYS_IPSET);
if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
set = ip_set(inst, index);
- if (set != NULL)
+ if (set)
__ip_set_put(set);
}
nfnl_unlock(NFNL_SUBSYS_IPSET);
}
EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
-/*
- * Communication protocol with userspace over netlink.
+/* Communication protocol with userspace over netlink.
*
* The commands are serialized by the nfnl mutex.
*/
@@ -725,7 +751,7 @@ start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8),
sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ if (!nlh)
return NULL;
nfmsg = nlmsg_data(nlh);
@@ -758,7 +784,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)
*id = IPSET_INVALID_ID;
for (i = 0; i < inst->ip_set_max; i++) {
set = ip_set(inst, i);
- if (set != NULL && STREQ(set->name, name)) {
+ if (set && STRNCMP(set->name, name)) {
*id = i;
break;
}
@@ -784,10 +810,10 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
*index = IPSET_INVALID_ID;
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s == NULL) {
+ if (!s) {
if (*index == IPSET_INVALID_ID)
*index = i;
- } else if (STREQ(name, s->name)) {
+ } else if (STRNCMP(name, s->name)) {
/* Name clash */
*set = s;
return -EEXIST;
@@ -816,18 +842,18 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *set, *clash = NULL;
ip_set_id_t index = IPSET_INVALID_ID;
- struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
+ struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {};
const char *name, *typename;
u8 family, revision;
u32 flags = flag_exist(nlh);
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
- attr[IPSET_ATTR_TYPENAME] == NULL ||
- attr[IPSET_ATTR_REVISION] == NULL ||
- attr[IPSET_ATTR_FAMILY] == NULL ||
- (attr[IPSET_ATTR_DATA] != NULL &&
+ !attr[IPSET_ATTR_SETNAME] ||
+ !attr[IPSET_ATTR_TYPENAME] ||
+ !attr[IPSET_ATTR_REVISION] ||
+ !attr[IPSET_ATTR_FAMILY] ||
+ (attr[IPSET_ATTR_DATA] &&
!flag_nested(attr[IPSET_ATTR_DATA]))))
return -IPSET_ERR_PROTOCOL;
@@ -838,33 +864,29 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
name, typename, family_name(family), revision);
- /*
- * First, and without any locks, allocate and initialize
+ /* First, and without any locks, allocate and initialize
* a normal base set structure.
*/
- set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
+ set = kzalloc(sizeof(*set), GFP_KERNEL);
if (!set)
return -ENOMEM;
- rwlock_init(&set->lock);
+ spin_lock_init(&set->lock);
strlcpy(set->name, name, IPSET_MAXNAMELEN);
set->family = family;
set->revision = revision;
- /*
- * Next, check that we know the type, and take
+ /* Next, check that we know the type, and take
* a reference on the type, to make sure it stays available
* while constructing our new set.
*
* After referencing the type, we try to create the type
* specific part of the set without holding any locks.
*/
- ret = find_set_type_get(typename, family, revision, &(set->type));
+ ret = find_set_type_get(typename, family, revision, &set->type);
if (ret)
goto out;
- /*
- * Without holding any locks, create private part.
- */
+ /* Without holding any locks, create private part. */
if (attr[IPSET_ATTR_DATA] &&
nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
set->type->create_policy)) {
@@ -878,8 +900,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
/* BTW, ret==0 here. */
- /*
- * Here, we have a valid, constructed set and we are protected
+ /* Here, we have a valid, constructed set and we are protected
* by the nfnl mutex. Find the first free index in ip_set_list
* and check clashing.
*/
@@ -887,7 +908,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
if (ret == -EEXIST) {
/* If this is the same set and requested, ignore error */
if ((flags & IPSET_FLAG_EXIST) &&
- STREQ(set->type->name, clash->type->name) &&
+ STRNCMP(set->type->name, clash->type->name) &&
set->type->family == clash->type->family &&
set->type->revision_min == clash->type->revision_min &&
set->type->revision_max == clash->type->revision_max &&
@@ -902,7 +923,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
/* Wraparound */
goto cleanup;
- list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL);
+ list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL);
if (!list)
goto cleanup;
/* nfnl mutex is held, both lists are valid */
@@ -916,12 +937,11 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
inst->ip_set_max = i;
kfree(tmp);
ret = 0;
- } else if (ret)
+ } else if (ret) {
goto cleanup;
+ }
- /*
- * Finally! Add our shiny new set to the list, and be done.
- */
+ /* Finally! Add our shiny new set to the list, and be done. */
pr_debug("create: '%s' created with index %u!\n", set->name, index);
ip_set(inst, index) = set;
@@ -946,12 +966,9 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
};
static void
-ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index)
+ip_set_destroy_set(struct ip_set *set)
{
- struct ip_set *set = ip_set(inst, index);
-
pr_debug("set: %s\n", set->name);
- ip_set(inst, index) = NULL;
/* Must call it without holding any lock */
set->variant->destroy(set);
@@ -986,30 +1003,36 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
if (!attr[IPSET_ATTR_SETNAME]) {
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s != NULL && s->ref) {
+ if (s && s->ref) {
ret = -IPSET_ERR_BUSY;
goto out;
}
}
+ inst->is_destroyed = true;
read_unlock_bh(&ip_set_ref_lock);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s != NULL)
- ip_set_destroy_set(inst, i);
+ if (s) {
+ ip_set(inst, i) = NULL;
+ ip_set_destroy_set(s);
+ }
}
+ /* Modified by ip_set_destroy() only, which is serialized */
+ inst->is_destroyed = false;
} else {
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
- if (s == NULL) {
+ if (!s) {
ret = -ENOENT;
goto out;
} else if (s->ref) {
ret = -IPSET_ERR_BUSY;
goto out;
}
+ ip_set(inst, i) = NULL;
read_unlock_bh(&ip_set_ref_lock);
- ip_set_destroy_set(inst, i);
+ ip_set_destroy_set(s);
}
return 0;
out:
@@ -1024,9 +1047,9 @@ ip_set_flush_set(struct ip_set *set)
{
pr_debug("set: %s\n", set->name);
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
set->variant->flush(set);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
}
static int
@@ -1044,12 +1067,12 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
if (!attr[IPSET_ATTR_SETNAME]) {
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s != NULL)
+ if (s)
ip_set_flush_set(s);
}
} else {
s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (s == NULL)
+ if (!s)
return -ENOENT;
ip_set_flush_set(s);
@@ -1081,12 +1104,12 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
- attr[IPSET_ATTR_SETNAME2] == NULL))
+ !attr[IPSET_ATTR_SETNAME] ||
+ !attr[IPSET_ATTR_SETNAME2]))
return -IPSET_ERR_PROTOCOL;
set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (set == NULL)
+ if (!set)
return -ENOENT;
read_lock_bh(&ip_set_ref_lock);
@@ -1098,7 +1121,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s != NULL && STREQ(s->name, name2)) {
+ if (s && STRNCMP(s->name, name2)) {
ret = -IPSET_ERR_EXIST_SETNAME2;
goto out;
}
@@ -1130,23 +1153,24 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
char from_name[IPSET_MAXNAMELEN];
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
- attr[IPSET_ATTR_SETNAME2] == NULL))
+ !attr[IPSET_ATTR_SETNAME] ||
+ !attr[IPSET_ATTR_SETNAME2]))
return -IPSET_ERR_PROTOCOL;
from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&from_id);
- if (from == NULL)
+ if (!from)
return -ENOENT;
to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]),
&to_id);
- if (to == NULL)
+ if (!to)
return -IPSET_ERR_EXIST_SETNAME2;
/* Features must not change.
- * Not an artificial restriction anymore, as we must prevent
- * possible loops created by swapping in setlist type of sets. */
+ * Not an artificial restriction anymore, as we must prevent
+ * possible loops created by swapping in setlist type of sets.
+ */
if (!(from->type->features == to->type->features &&
from->family == to->family))
return -IPSET_ERR_TYPE_MISMATCH;
@@ -1177,12 +1201,16 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
static int
ip_set_dump_done(struct netlink_callback *cb)
{
- struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET];
if (cb->args[IPSET_CB_ARG0]) {
- pr_debug("release set %s\n",
- ip_set(inst, cb->args[IPSET_CB_INDEX])->name);
- __ip_set_put_byindex(inst,
- (ip_set_id_t) cb->args[IPSET_CB_INDEX]);
+ struct ip_set_net *inst =
+ (struct ip_set_net *)cb->args[IPSET_CB_NET];
+ ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
+ struct ip_set *set = ip_set(inst, index);
+
+ if (set->variant->uref)
+ set->variant->uref(set, cb, false);
+ pr_debug("release set %s\n", set->name);
+ __ip_set_put_byindex(inst, index);
}
return 0;
}
@@ -1204,7 +1232,7 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
{
struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
- struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1];
struct nlattr *attr = (void *)nlh + min_len;
u32 dump_type;
ip_set_id_t index;
@@ -1213,27 +1241,23 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
nla_parse(cda, IPSET_ATTR_CMD_MAX,
attr, nlh->nlmsg_len - min_len, ip_set_setname_policy);
- /* cb->args[IPSET_CB_NET]: net namespace
- * [IPSET_CB_DUMP]: dump single set/all sets
- * [IPSET_CB_INDEX]: set index
- * [IPSET_CB_ARG0]: type specific
- */
-
if (cda[IPSET_ATTR_SETNAME]) {
struct ip_set *set;
set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),
&index);
- if (set == NULL)
+ if (!set)
return -ENOENT;
dump_type = DUMP_ONE;
cb->args[IPSET_CB_INDEX] = index;
- } else
+ } else {
dump_type = DUMP_ALL;
+ }
if (cda[IPSET_ATTR_FLAGS]) {
u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);
+
dump_type |= (f << 16);
}
cb->args[IPSET_CB_NET] = (unsigned long)inst;
@@ -1251,6 +1275,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk));
u32 dump_type, dump_flags;
+ bool is_destroyed;
int ret = 0;
if (!cb->args[IPSET_CB_DUMP]) {
@@ -1258,7 +1283,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
if (ret < 0) {
nlh = nlmsg_hdr(cb->skb);
/* We have to create and send the error message
- * manually :-( */
+ * manually :-(
+ */
if (nlh->nlmsg_flags & NLM_F_ACK)
netlink_ack(cb->skb, nlh, ret);
return ret;
@@ -1276,13 +1302,21 @@ dump_last:
pr_debug("dump type, flag: %u %u index: %ld\n",
dump_type, dump_flags, cb->args[IPSET_CB_INDEX]);
for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) {
- index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
+ index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
+ write_lock_bh(&ip_set_ref_lock);
set = ip_set(inst, index);
- if (set == NULL) {
+ is_destroyed = inst->is_destroyed;
+ if (!set || is_destroyed) {
+ write_unlock_bh(&ip_set_ref_lock);
if (dump_type == DUMP_ONE) {
ret = -ENOENT;
goto out;
}
+ if (is_destroyed) {
+ /* All sets are just being destroyed */
+ ret = 0;
+ goto out;
+ }
continue;
}
/* When dumping all sets, we must dump "sorted"
@@ -1290,14 +1324,17 @@ dump_last:
*/
if (dump_type != DUMP_ONE &&
((dump_type == DUMP_ALL) ==
- !!(set->type->features & IPSET_DUMP_LAST)))
+ !!(set->type->features & IPSET_DUMP_LAST))) {
+ write_unlock_bh(&ip_set_ref_lock);
continue;
+ }
pr_debug("List set: %s\n", set->name);
if (!cb->args[IPSET_CB_ARG0]) {
/* Start listing: make sure set won't be destroyed */
pr_debug("reference set\n");
- __ip_set_get(set);
+ set->ref++;
}
+ write_unlock_bh(&ip_set_ref_lock);
nlh = start_msg(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, flags,
IPSET_CMD_LIST);
@@ -1325,11 +1362,13 @@ dump_last:
goto release_refcount;
if (dump_flags & IPSET_FLAG_LIST_HEADER)
goto next_set;
+ if (set->variant->uref)
+ set->variant->uref(set, cb, true);
/* Fall through and add elements */
default:
- read_lock_bh(&set->lock);
+ rcu_read_lock_bh();
ret = set->variant->list(set, skb, cb);
- read_unlock_bh(&set->lock);
+ rcu_read_unlock_bh();
if (!cb->args[IPSET_CB_ARG0])
/* Set is done, proceed with next one */
goto next_set;
@@ -1341,6 +1380,8 @@ dump_last:
dump_type = DUMP_LAST;
cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16);
cb->args[IPSET_CB_INDEX] = 0;
+ if (set && set->variant->uref)
+ set->variant->uref(set, cb, false);
goto dump_last;
}
goto out;
@@ -1355,7 +1396,10 @@ next_set:
release_refcount:
/* If there was an error or set is done, release set */
if (ret || !cb->args[IPSET_CB_ARG0]) {
- pr_debug("release set %s\n", ip_set(inst, index)->name);
+ set = ip_set(inst, index);
+ if (set->variant->uref)
+ set->variant->uref(set, cb, false);
+ pr_debug("release set %s\n", set->name);
__ip_set_put_byindex(inst, index);
cb->args[IPSET_CB_ARG0] = 0;
}
@@ -1407,9 +1451,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
do {
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
retried = true;
} while (ret == -EAGAIN &&
set->variant->resize &&
@@ -1425,12 +1469,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
size_t payload = min(SIZE_MAX,
sizeof(*errmsg) + nlmsg_len(nlh));
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
- struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1];
struct nlattr *cmdattr;
u32 *errline;
skb2 = nlmsg_new(payload, GFP_KERNEL);
- if (skb2 == NULL)
+ if (!skb2)
return -ENOMEM;
rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
@@ -1447,7 +1491,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
*errline = lineno;
- netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
/* Signal netlink not to send its ACK/errmsg. */
return -EINTR;
}
@@ -1462,25 +1507,25 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
{
struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
struct ip_set *set;
- struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
const struct nlattr *nla;
u32 flags = flag_exist(nlh);
bool use_lineno;
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
+ !attr[IPSET_ATTR_SETNAME] ||
!((attr[IPSET_ATTR_DATA] != NULL) ^
(attr[IPSET_ATTR_ADT] != NULL)) ||
- (attr[IPSET_ATTR_DATA] != NULL &&
+ (attr[IPSET_ATTR_DATA] &&
!flag_nested(attr[IPSET_ATTR_DATA])) ||
- (attr[IPSET_ATTR_ADT] != NULL &&
+ (attr[IPSET_ATTR_ADT] &&
(!flag_nested(attr[IPSET_ATTR_ADT]) ||
- attr[IPSET_ATTR_LINENO] == NULL))))
+ !attr[IPSET_ATTR_LINENO]))))
return -IPSET_ERR_PROTOCOL;
set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (set == NULL)
+ if (!set)
return -ENOENT;
use_lineno = !!attr[IPSET_ATTR_LINENO];
@@ -1517,25 +1562,25 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
{
struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
struct ip_set *set;
- struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
const struct nlattr *nla;
u32 flags = flag_exist(nlh);
bool use_lineno;
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
+ !attr[IPSET_ATTR_SETNAME] ||
!((attr[IPSET_ATTR_DATA] != NULL) ^
(attr[IPSET_ATTR_ADT] != NULL)) ||
- (attr[IPSET_ATTR_DATA] != NULL &&
+ (attr[IPSET_ATTR_DATA] &&
!flag_nested(attr[IPSET_ATTR_DATA])) ||
- (attr[IPSET_ATTR_ADT] != NULL &&
+ (attr[IPSET_ATTR_ADT] &&
(!flag_nested(attr[IPSET_ATTR_ADT]) ||
- attr[IPSET_ATTR_LINENO] == NULL))))
+ !attr[IPSET_ATTR_LINENO]))))
return -IPSET_ERR_PROTOCOL;
set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (set == NULL)
+ if (!set)
return -ENOENT;
use_lineno = !!attr[IPSET_ATTR_LINENO];
@@ -1572,26 +1617,26 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
{
struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
struct ip_set *set;
- struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL ||
- attr[IPSET_ATTR_DATA] == NULL ||
+ !attr[IPSET_ATTR_SETNAME] ||
+ !attr[IPSET_ATTR_DATA] ||
!flag_nested(attr[IPSET_ATTR_DATA])))
return -IPSET_ERR_PROTOCOL;
set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (set == NULL)
+ if (!set)
return -ENOENT;
if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
set->type->adt_policy))
return -IPSET_ERR_PROTOCOL;
- read_lock_bh(&set->lock);
+ rcu_read_lock_bh();
ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
- read_unlock_bh(&set->lock);
+ rcu_read_unlock_bh();
/* Userspace can't trigger element to be re-added */
if (ret == -EAGAIN)
ret = 1;
@@ -1613,15 +1658,15 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_SETNAME] == NULL))
+ !attr[IPSET_ATTR_SETNAME]))
return -IPSET_ERR_PROTOCOL;
set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (set == NULL)
+ if (!set)
return -ENOENT;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL)
+ if (!skb2)
return -ENOMEM;
nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
@@ -1670,8 +1715,8 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb,
int ret = 0;
if (unlikely(protocol_failed(attr) ||
- attr[IPSET_ATTR_TYPENAME] == NULL ||
- attr[IPSET_ATTR_FAMILY] == NULL))
+ !attr[IPSET_ATTR_TYPENAME] ||
+ !attr[IPSET_ATTR_FAMILY]))
return -IPSET_ERR_PROTOCOL;
family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
@@ -1681,7 +1726,7 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb,
return ret;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL)
+ if (!skb2)
return -ENOMEM;
nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
@@ -1726,11 +1771,11 @@ ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
struct nlmsghdr *nlh2;
int ret = 0;
- if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
+ if (unlikely(!attr[IPSET_ATTR_PROTOCOL]))
return -IPSET_ERR_PROTOCOL;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL)
+ if (!skb2)
return -ENOMEM;
nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
@@ -1858,7 +1903,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
ret = -EFAULT;
goto done;
}
- op = (unsigned int *) data;
+ op = (unsigned int *)data;
if (*op < IP_SET_OP_VERSION) {
/* Check the version at the beginning of operations */
@@ -1970,10 +2015,11 @@ ip_set_net_init(struct net *net)
if (inst->ip_set_max >= IPSET_INVALID_ID)
inst->ip_set_max = IPSET_INVALID_ID - 1;
- list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL);
+ list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL);
if (!list)
return -ENOMEM;
- inst->is_deleted = 0;
+ inst->is_deleted = false;
+ inst->is_destroyed = false;
rcu_assign_pointer(inst->ip_set_list, list);
return 0;
}
@@ -1986,12 +2032,14 @@ ip_set_net_exit(struct net *net)
struct ip_set *set = NULL;
ip_set_id_t i;
- inst->is_deleted = 1; /* flag for ip_set_nfnl_put */
+ inst->is_deleted = true; /* flag for ip_set_nfnl_put */
for (i = 0; i < inst->ip_set_max; i++) {
set = ip_set(inst, i);
- if (set != NULL)
- ip_set_destroy_set(inst, i);
+ if (set) {
+ ip_set(inst, i) = NULL;
+ ip_set_destroy_set(set);
+ }
}
kfree(rcu_dereference_protected(inst->ip_set_list, 1));
}
@@ -2003,11 +2051,11 @@ static struct pernet_operations ip_set_net_ops = {
.size = sizeof(struct ip_set_net)
};
-
static int __init
ip_set_init(void)
{
int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+
if (ret != 0) {
pr_err("ip_set: cannot register with nfnetlink.\n");
return ret;
diff --git a/kernel/net/netfilter/ipset/ip_set_getport.c b/kernel/net/netfilter/ipset/ip_set_getport.c
index 29fb01ddf..42c3e3ba1 100644
--- a/kernel/net/netfilter/ipset/ip_set_getport.c
+++ b/kernel/net/netfilter/ipset/ip_set_getport.c
@@ -30,7 +30,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
const struct tcphdr *th;
th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph);
- if (th == NULL)
+ if (!th)
/* No choice either */
return false;
@@ -42,7 +42,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
const sctp_sctphdr_t *sh;
sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh);
- if (sh == NULL)
+ if (!sh)
/* No choice either */
return false;
@@ -55,7 +55,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
const struct udphdr *uh;
uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph);
- if (uh == NULL)
+ if (!uh)
/* No choice either */
return false;
@@ -67,7 +67,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
const struct icmphdr *ic;
ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
- if (ic == NULL)
+ if (!ic)
return false;
*port = (__force __be16)htons((ic->type << 8) | ic->code);
@@ -78,7 +78,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
const struct icmp6hdr *ic;
ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
- if (ic == NULL)
+ if (!ic)
return false;
*port = (__force __be16)
@@ -98,7 +98,7 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
__be16 *port, u8 *proto)
{
const struct iphdr *iph = ip_hdr(skb);
- unsigned int protooff = ip_hdrlen(skb);
+ unsigned int protooff = skb_network_offset(skb) + ip_hdrlen(skb);
int protocol = iph->protocol;
/* See comments at tcp_match in ip_tables.c */
@@ -116,7 +116,8 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
return false;
default:
/* Other protocols doesn't have ports,
- so we can match fragments */
+ * so we can match fragments.
+ */
*proto = protocol;
return true;
}
@@ -135,7 +136,9 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
__be16 frag_off = 0;
nexthdr = ipv6_hdr(skb)->nexthdr;
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ protoff = ipv6_skip_exthdr(skb,
+ skb_network_offset(skb) +
+ sizeof(struct ipv6hdr), &nexthdr,
&frag_off);
if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
return false;
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_gen.h b/kernel/net/netfilter/ipset/ip_set_hash_gen.h
index 974ff386d..e5336ab36 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/kernel/net/netfilter/ipset/ip_set_hash_gen.h
@@ -10,19 +10,19 @@
#include <linux/rcupdate.h>
#include <linux/jhash.h>
+#include <linux/types.h>
#include <linux/netfilter/ipset/ip_set_timeout.h>
-#ifndef rcu_dereference_bh
-#define rcu_dereference_bh(p) rcu_dereference(p)
-#endif
+
+#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c)
+#define ipset_dereference_protected(p, set) \
+ __ipset_dereference_protected(p, spin_is_locked(&(set)->lock))
#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
/* Hashing which uses arrays to resolve clashing. The hash table is resized
* (doubled) when searching becomes too long.
* Internally jhash is used with the assumption that the size of the
- * stored data is a multiple of sizeof(u32). If storage supports timeout,
- * the timeout field must be the last one in the data structure - that field
- * is ignored when computing the hash key.
+ * stored data is a multiple of sizeof(u32).
*
* Readers and resizing
*
@@ -35,7 +35,9 @@
/* Number of elements to store in an initial array block */
#define AHASH_INIT_SIZE 4
/* Max number of elements to store in an array block */
-#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE)
+#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE)
+/* Max number of elements in the array block when tuned */
+#define AHASH_MAX_TUNED 64
/* Max number of elements can be tuned */
#ifdef IP_SET_HASH_WITH_MULTI
@@ -53,8 +55,9 @@ tune_ahash_max(u8 curr, u32 multi)
/* Currently, at listing one hash bucket must fit into a message.
* Therefore we have a hard limit here.
*/
- return n > curr && n <= 64 ? n : curr;
+ return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
}
+
#define TUNE_AHASH_MAX(h, multi) \
((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
#else
@@ -64,18 +67,24 @@ tune_ahash_max(u8 curr, u32 multi)
/* A hash bucket */
struct hbucket {
- void *value; /* the array of the values */
+ struct rcu_head rcu; /* for call_rcu_bh */
+ /* Which positions are used in the array */
+ DECLARE_BITMAP(used, AHASH_MAX_TUNED);
u8 size; /* size of the array */
u8 pos; /* position of the first free entry */
+ unsigned char value[0] /* the array of the values */
+ __aligned(__alignof__(u64));
};
/* The hash table: the table size stored here in order to make resizing easy */
struct htable {
+ atomic_t ref; /* References for resizing */
+ atomic_t uref; /* References for dumping */
u8 htable_bits; /* size of hash table == 2^htable_bits */
- struct hbucket bucket[0]; /* hashtable buckets */
+ struct hbucket __rcu *bucket[0]; /* hashtable buckets */
};
-#define hbucket(h, i) (&((h)->bucket[i]))
+#define hbucket(h, i) ((h)->bucket[i])
#ifndef IPSET_NET_COUNT
#define IPSET_NET_COUNT 1
@@ -83,8 +92,8 @@ struct htable {
/* Book-keeping of the prefixes added to the set */
struct net_prefixes {
- u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */
- u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */
+ u32 nets[IPSET_NET_COUNT]; /* number of elements for this cidr */
+ u8 cidr[IPSET_NET_COUNT]; /* the cidr value */
};
/* Compute the hash table size */
@@ -97,11 +106,11 @@ htable_size(u8 hbits)
if (hbits > 31)
return 0;
hsize = jhash_size(hbits);
- if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket)
+ if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *)
< hsize)
return 0;
- return hsize * sizeof(struct hbucket) + sizeof(struct htable);
+ return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
}
/* Compute htable_bits from the user input parameter hashsize */
@@ -110,6 +119,7 @@ htable_bits(u32 hashsize)
{
/* Assume that hashsize == 2^htable_bits */
u8 bits = fls(hashsize - 1);
+
if (jhash_size(bits) != hashsize)
/* Round up to the first 2^n value */
bits = fls(hashsize);
@@ -117,30 +127,6 @@ htable_bits(u32 hashsize)
return bits;
}
-static int
-hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
-{
- if (n->pos >= n->size) {
- void *tmp;
-
- if (n->size >= ahash_max)
- /* Trigger rehashing */
- return -EAGAIN;
-
- tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize,
- GFP_ATOMIC);
- if (!tmp)
- return -ENOMEM;
- if (n->size) {
- memcpy(tmp, n->value, n->size * dsize);
- kfree(n->value);
- }
- n->value = tmp;
- n->size += AHASH_INIT_SIZE;
- }
- return 0;
-}
-
#ifdef IP_SET_HASH_WITH_NETS
#if IPSET_NET_COUNT > 1
#define __CIDR(cidr, i) (cidr[i])
@@ -149,23 +135,31 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#endif
/* cidr + 1 is stored in net_prefixes to support /0 */
-#define SCIDR(cidr, i) (__CIDR(cidr, i) + 1)
+#define NCIDR_PUT(cidr) ((cidr) + 1)
+#define NCIDR_GET(cidr) ((cidr) - 1)
#ifdef IP_SET_HASH_WITH_NETS_PACKED
/* When cidr is packed with nomatch, cidr - 1 is stored in the data entry */
-#define GCIDR(cidr, i) (__CIDR(cidr, i) + 1)
-#define NCIDR(cidr) (cidr)
+#define DCIDR_PUT(cidr) ((cidr) - 1)
+#define DCIDR_GET(cidr, i) (__CIDR(cidr, i) + 1)
#else
-#define GCIDR(cidr, i) (__CIDR(cidr, i))
-#define NCIDR(cidr) (cidr - 1)
+#define DCIDR_PUT(cidr) (cidr)
+#define DCIDR_GET(cidr, i) __CIDR(cidr, i)
#endif
+#define INIT_CIDR(cidr, host_mask) \
+ DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask))
+
#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128)
#ifdef IP_SET_HASH_WITH_NET0
+/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */
#define NLEN(family) (SET_HOST_MASK(family) + 1)
+#define CIDR_POS(c) ((c) - 1)
#else
+/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */
#define NLEN(family) SET_HOST_MASK(family)
+#define CIDR_POS(c) ((c) - 2)
#endif
#else
@@ -180,6 +174,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#undef mtype_data_equal
#undef mtype_do_data_match
#undef mtype_data_set_flags
+#undef mtype_data_reset_elem
#undef mtype_data_reset_flags
#undef mtype_data_netmask
#undef mtype_data_list
@@ -193,7 +188,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#undef mtype_ahash_memsize
#undef mtype_flush
#undef mtype_destroy
-#undef mtype_gc_init
#undef mtype_same_set
#undef mtype_kadt
#undef mtype_uadt
@@ -203,6 +197,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#undef mtype_del
#undef mtype_test_cidrs
#undef mtype_test
+#undef mtype_uref
#undef mtype_expire
#undef mtype_resize
#undef mtype_head
@@ -227,6 +222,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#define mtype_data_list IPSET_TOKEN(MTYPE, _data_list)
#define mtype_data_next IPSET_TOKEN(MTYPE, _data_next)
#define mtype_elem IPSET_TOKEN(MTYPE, _elem)
+
#define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy)
#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup)
#define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr)
@@ -234,7 +230,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize)
#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
-#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
@@ -244,23 +239,36 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
#define mtype_del IPSET_TOKEN(MTYPE, _del)
#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
#define mtype_test IPSET_TOKEN(MTYPE, _test)
+#define mtype_uref IPSET_TOKEN(MTYPE, _uref)
#define mtype_expire IPSET_TOKEN(MTYPE, _expire)
#define mtype_resize IPSET_TOKEN(MTYPE, _resize)
#define mtype_head IPSET_TOKEN(MTYPE, _head)
#define mtype_list IPSET_TOKEN(MTYPE, _list)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
+#ifndef MTYPE
+#error "MTYPE is not defined!"
+#endif
+
+#ifndef HOST_MASK
+#error "HOST_MASK is not defined!"
+#endif
+
#ifndef HKEY_DATALEN
#define HKEY_DATALEN sizeof(struct mtype_elem)
#endif
#define HKEY(data, initval, htable_bits) \
-(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \
+(jhash2((u32 *)(data), HKEY_DATALEN / sizeof(u32), initval) \
& jhash_mask(htable_bits))
#ifndef htype
+#ifndef HTYPE
+#error "HTYPE is not defined!"
+#endif /* HTYPE */
#define htype HTYPE
/* The generic hash structure */
@@ -280,18 +288,16 @@ struct htype {
#ifdef IP_SET_HASH_WITH_NETMASK
u8 netmask; /* netmask value for subnets to store */
#endif
-#ifdef IP_SET_HASH_WITH_RBTREE
- struct rb_root rbtree;
-#endif
#ifdef IP_SET_HASH_WITH_NETS
struct net_prefixes nets[0]; /* book-keeping of prefixes */
#endif
};
-#endif
+#endif /* htype */
#ifdef IP_SET_HASH_WITH_NETS
/* Network cidr size book keeping when the hash stores different
- * sized networks */
+ * sized networks. cidr == real cidr + 1 to support /0.
+ */
static void
mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
{
@@ -299,12 +305,12 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
/* Add in increasing prefix order, so larger cidr first */
for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) {
- if (j != -1)
+ if (j != -1) {
continue;
- else if (h->nets[i].cidr[n] < cidr)
+ } else if (h->nets[i].cidr[n] < cidr) {
j = i;
- else if (h->nets[i].cidr[n] == cidr) {
- h->nets[cidr - 1].nets[n]++;
+ } else if (h->nets[i].cidr[n] == cidr) {
+ h->nets[CIDR_POS(cidr)].nets[n]++;
return;
}
}
@@ -313,7 +319,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
h->nets[i].cidr[n] = h->nets[i - 1].cidr[n];
}
h->nets[i].cidr[n] = cidr;
- h->nets[cidr - 1].nets[n] = 1;
+ h->nets[CIDR_POS(cidr)].nets[n] = 1;
}
static void
@@ -322,15 +328,15 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
u8 i, j, net_end = nets_length - 1;
for (i = 0; i < nets_length; i++) {
- if (h->nets[i].cidr[n] != cidr)
- continue;
- h->nets[cidr -1].nets[n]--;
- if (h->nets[cidr -1].nets[n] > 0)
- return;
+ if (h->nets[i].cidr[n] != cidr)
+ continue;
+ h->nets[CIDR_POS(cidr)].nets[n]--;
+ if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
+ return;
for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
- h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
+ h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
h->nets[j].cidr[n] = 0;
- return;
+ return;
}
}
#endif
@@ -341,15 +347,18 @@ mtype_ahash_memsize(const struct htype *h, const struct htable *t,
u8 nets_length, size_t dsize)
{
u32 i;
- size_t memsize = sizeof(*h)
- + sizeof(*t)
+ struct hbucket *n;
+ size_t memsize = sizeof(*h) + sizeof(*t);
+
#ifdef IP_SET_HASH_WITH_NETS
- + sizeof(struct net_prefixes) * nets_length
+ memsize += sizeof(struct net_prefixes) * nets_length;
#endif
- + jhash_size(t->htable_bits) * sizeof(struct hbucket);
-
- for (i = 0; i < jhash_size(t->htable_bits); i++)
- memsize += t->bucket[i].size * dsize;
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = rcu_dereference_bh(hbucket(t, i));
+ if (!n)
+ continue;
+ memsize += sizeof(struct hbucket) + n->size * dsize;
+ }
return memsize;
}
@@ -364,7 +373,8 @@ mtype_ext_cleanup(struct ip_set *set, struct hbucket *n)
int i;
for (i = 0; i < n->pos; i++)
- ip_set_ext_destroy(set, ahash_data(n, i, set->dsize));
+ if (test_bit(i, n->used))
+ ip_set_ext_destroy(set, ahash_data(n, i, set->dsize));
}
/* Flush a hash type of set: destroy all elements */
@@ -376,16 +386,16 @@ mtype_flush(struct ip_set *set)
struct hbucket *n;
u32 i;
- t = rcu_dereference_bh_nfnl(h->table);
+ t = ipset_dereference_protected(h->table, set);
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = hbucket(t, i);
- if (n->size) {
- if (set->extensions & IPSET_EXT_DESTROY)
- mtype_ext_cleanup(set, n);
- n->size = n->pos = 0;
- /* FIXME: use slab cache */
- kfree(n->value);
- }
+ n = __ipset_dereference_protected(hbucket(t, i), 1);
+ if (!n)
+ continue;
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set, n);
+ /* FIXME: use slab cache */
+ rcu_assign_pointer(hbucket(t, i), NULL);
+ kfree_rcu(n, rcu);
}
#ifdef IP_SET_HASH_WITH_NETS
memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family));
@@ -401,13 +411,13 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
u32 i;
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = hbucket(t, i);
- if (n->size) {
- if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
- mtype_ext_cleanup(set, n);
- /* FIXME: use slab cache */
- kfree(n->value);
- }
+ n = __ipset_dereference_protected(hbucket(t, i), 1);
+ if (!n)
+ continue;
+ if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
+ mtype_ext_cleanup(set, n);
+ /* FIXME: use slab cache */
+ kfree(n);
}
ip_set_free(t);
@@ -419,13 +429,11 @@ mtype_destroy(struct ip_set *set)
{
struct htype *h = set->data;
- if (set->extensions & IPSET_EXT_TIMEOUT)
+ if (SET_WITH_TIMEOUT(set))
del_timer_sync(&h->gc);
- mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true);
-#ifdef IP_SET_HASH_WITH_RBTREE
- rbtree_destroy(&h->rbtree);
-#endif
+ mtype_ahash_destroy(set,
+ __ipset_dereference_protected(h->table, 1), true);
kfree(h);
set->data = NULL;
@@ -437,7 +445,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
struct htype *h = set->data;
init_timer(&h->gc);
- h->gc.data = (unsigned long) set;
+ h->gc.data = (unsigned long)set;
h->gc.function = gc;
h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&h->gc);
@@ -468,63 +476,78 @@ static void
mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
{
struct htable *t;
- struct hbucket *n;
+ struct hbucket *n, *tmp;
struct mtype_elem *data;
- u32 i;
- int j;
+ u32 i, j, d;
#ifdef IP_SET_HASH_WITH_NETS
u8 k;
#endif
- rcu_read_lock_bh();
- t = rcu_dereference_bh(h->table);
+ t = ipset_dereference_protected(h->table, set);
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = hbucket(t, i);
- for (j = 0; j < n->pos; j++) {
+ n = __ipset_dereference_protected(hbucket(t, i), 1);
+ if (!n)
+ continue;
+ for (j = 0, d = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used)) {
+ d++;
+ continue;
+ }
data = ahash_data(n, j, dsize);
if (ip_set_timeout_expired(ext_timeout(data, set))) {
pr_debug("expired %u/%u\n", i, j);
+ clear_bit(j, n->used);
+ smp_mb__after_atomic();
#ifdef IP_SET_HASH_WITH_NETS
for (k = 0; k < IPSET_NET_COUNT; k++)
- mtype_del_cidr(h, SCIDR(data->cidr, k),
- nets_length, k);
+ mtype_del_cidr(h,
+ NCIDR_PUT(DCIDR_GET(data->cidr,
+ k)),
+ nets_length, k);
#endif
ip_set_ext_destroy(set, data);
- if (j != n->pos - 1)
- /* Not last one */
- memcpy(data,
- ahash_data(n, n->pos - 1, dsize),
- dsize);
- n->pos--;
h->elements--;
+ d++;
}
}
- if (n->pos + AHASH_INIT_SIZE < n->size) {
- void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
- * dsize,
- GFP_ATOMIC);
+ if (d >= AHASH_INIT_SIZE) {
+ if (d >= n->size) {
+ rcu_assign_pointer(hbucket(t, i), NULL);
+ kfree_rcu(n, rcu);
+ continue;
+ }
+ tmp = kzalloc(sizeof(*tmp) +
+ (n->size - AHASH_INIT_SIZE) * dsize,
+ GFP_ATOMIC);
if (!tmp)
/* Still try to delete expired elements */
continue;
- n->size -= AHASH_INIT_SIZE;
- memcpy(tmp, n->value, n->size * dsize);
- kfree(n->value);
- n->value = tmp;
+ tmp->size = n->size - AHASH_INIT_SIZE;
+ for (j = 0, d = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, dsize);
+ memcpy(tmp->value + d * dsize, data, dsize);
+ set_bit(d, tmp->used);
+ d++;
+ }
+ tmp->pos = d;
+ rcu_assign_pointer(hbucket(t, i), tmp);
+ kfree_rcu(n, rcu);
}
}
- rcu_read_unlock_bh();
}
static void
mtype_gc(unsigned long ul_set)
{
- struct ip_set *set = (struct ip_set *) ul_set;
+ struct ip_set *set = (struct ip_set *)ul_set;
struct htype *h = set->data;
pr_debug("called\n");
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
mtype_expire(set, h, NLEN(set->family), set->dsize);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&h->gc);
@@ -532,93 +555,152 @@ mtype_gc(unsigned long ul_set)
/* Resize a hash: create a new hash table with doubling the hashsize
* and inserting the elements to it. Repeat until we succeed or
- * fail due to memory pressures. */
+ * fail due to memory pressures.
+ */
static int
mtype_resize(struct ip_set *set, bool retried)
{
struct htype *h = set->data;
- struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table);
- u8 htable_bits = orig->htable_bits;
+ struct htable *t, *orig;
+ u8 htable_bits;
+ size_t dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
+ struct mtype_elem *tmp;
#endif
struct mtype_elem *data;
struct mtype_elem *d;
struct hbucket *n, *m;
- u32 i, j;
+ u32 i, j, key;
int ret;
- /* Try to cleanup once */
- if (SET_WITH_TIMEOUT(set) && !retried) {
- i = h->elements;
- write_lock_bh(&set->lock);
- mtype_expire(set, set->data, NLEN(set->family), set->dsize);
- write_unlock_bh(&set->lock);
- if (h->elements < i)
- return 0;
- }
+#ifdef IP_SET_HASH_WITH_NETS
+ tmp = kmalloc(dsize, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+#endif
+ rcu_read_lock_bh();
+ orig = rcu_dereference_bh_nfnl(h->table);
+ htable_bits = orig->htable_bits;
+ rcu_read_unlock_bh();
retry:
ret = 0;
htable_bits++;
- pr_debug("attempt to resize set %s from %u to %u, t %p\n",
- set->name, orig->htable_bits, htable_bits, orig);
if (!htable_bits) {
/* In case we have plenty of memory :-) */
pr_warn("Cannot increase the hashsize of set %s further\n",
set->name);
- return -IPSET_ERR_HASH_FULL;
+ ret = -IPSET_ERR_HASH_FULL;
+ goto out;
+ }
+ t = ip_set_alloc(htable_size(htable_bits));
+ if (!t) {
+ ret = -ENOMEM;
+ goto out;
}
- t = ip_set_alloc(sizeof(*t)
- + jhash_size(htable_bits) * sizeof(struct hbucket));
- if (!t)
- return -ENOMEM;
t->htable_bits = htable_bits;
- read_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
+ orig = __ipset_dereference_protected(h->table, 1);
+ /* There can't be another parallel resizing, but dumping is possible */
+ atomic_set(&orig->ref, 1);
+ atomic_inc(&orig->uref);
+ pr_debug("attempt to resize set %s from %u to %u, t %p\n",
+ set->name, orig->htable_bits, htable_bits, orig);
for (i = 0; i < jhash_size(orig->htable_bits); i++) {
- n = hbucket(orig, i);
+ n = __ipset_dereference_protected(hbucket(orig, i), 1);
+ if (!n)
+ continue;
for (j = 0; j < n->pos; j++) {
- data = ahash_data(n, j, set->dsize);
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, dsize);
#ifdef IP_SET_HASH_WITH_NETS
+ /* We have readers running parallel with us,
+ * so the live data cannot be modified.
+ */
flags = 0;
+ memcpy(tmp, data, dsize);
+ data = tmp;
mtype_data_reset_flags(data, &flags);
#endif
- m = hbucket(t, HKEY(data, h->initval, htable_bits));
- ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize);
- if (ret < 0) {
-#ifdef IP_SET_HASH_WITH_NETS
- mtype_data_reset_flags(data, &flags);
-#endif
- read_unlock_bh(&set->lock);
- mtype_ahash_destroy(set, t, false);
- if (ret == -EAGAIN)
- goto retry;
- return ret;
+ key = HKEY(data, h->initval, htable_bits);
+ m = __ipset_dereference_protected(hbucket(t, key), 1);
+ if (!m) {
+ m = kzalloc(sizeof(*m) +
+ AHASH_INIT_SIZE * dsize,
+ GFP_ATOMIC);
+ if (!m) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+ m->size = AHASH_INIT_SIZE;
+ RCU_INIT_POINTER(hbucket(t, key), m);
+ } else if (m->pos >= m->size) {
+ struct hbucket *ht;
+
+ if (m->size >= AHASH_MAX(h)) {
+ ret = -EAGAIN;
+ } else {
+ ht = kzalloc(sizeof(*ht) +
+ (m->size + AHASH_INIT_SIZE)
+ * dsize,
+ GFP_ATOMIC);
+ if (!ht)
+ ret = -ENOMEM;
+ }
+ if (ret < 0)
+ goto cleanup;
+ memcpy(ht, m, sizeof(struct hbucket) +
+ m->size * dsize);
+ ht->size = m->size + AHASH_INIT_SIZE;
+ kfree(m);
+ m = ht;
+ RCU_INIT_POINTER(hbucket(t, key), ht);
}
- d = ahash_data(m, m->pos++, set->dsize);
- memcpy(d, data, set->dsize);
+ d = ahash_data(m, m->pos, dsize);
+ memcpy(d, data, dsize);
+ set_bit(m->pos++, m->used);
#ifdef IP_SET_HASH_WITH_NETS
mtype_data_reset_flags(d, &flags);
#endif
}
}
-
rcu_assign_pointer(h->table, t);
- read_unlock_bh(&set->lock);
+
+ spin_unlock_bh(&set->lock);
/* Give time to other readers of the set */
synchronize_rcu_bh();
pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
orig->htable_bits, orig, t->htable_bits, t);
- mtype_ahash_destroy(set, orig, false);
+ /* If there's nobody else dumping the table, destroy it */
+ if (atomic_dec_and_test(&orig->uref)) {
+ pr_debug("Table destroy by resize %p\n", orig);
+ mtype_ahash_destroy(set, orig, false);
+ }
- return 0;
+out:
+#ifdef IP_SET_HASH_WITH_NETS
+ kfree(tmp);
+#endif
+ return ret;
+
+cleanup:
+ atomic_set(&orig->ref, 0);
+ atomic_dec(&orig->uref);
+ spin_unlock_bh(&set->lock);
+ mtype_ahash_destroy(set, t, false);
+ if (ret == -EAGAIN)
+ goto retry;
+ goto out;
}
/* Add an element to a hash and update the internal counters when succeeded,
- * otherwise report the proper error code. */
+ * otherwise report the proper error code.
+ */
static int
mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct ip_set_ext *mext, u32 flags)
@@ -627,17 +709,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct htable *t;
const struct mtype_elem *d = value;
struct mtype_elem *data;
- struct hbucket *n;
- int i, ret = 0;
- int j = AHASH_MAX(h) + 1;
+ struct hbucket *n, *old = ERR_PTR(-ENOENT);
+ int i, j = -1;
bool flag_exist = flags & IPSET_FLAG_EXIST;
+ bool deleted = false, forceadd = false, reuse = false;
u32 key, multi = 0;
- rcu_read_lock_bh();
- t = rcu_dereference_bh(h->table);
+ if (h->elements >= h->maxelem) {
+ if (SET_WITH_TIMEOUT(set))
+ /* FIXME: when set is full, we slow down here */
+ mtype_expire(set, h, NLEN(set->family), set->dsize);
+ if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set))
+ forceadd = true;
+ }
+
+ t = ipset_dereference_protected(h->table, set);
key = HKEY(value, h->initval, t->htable_bits);
- n = hbucket(t, key);
+ n = __ipset_dereference_protected(hbucket(t, key), 1);
+ if (!n) {
+ if (forceadd) {
+ if (net_ratelimit())
+ pr_warn("Set %s is full, maxelem %u reached\n",
+ set->name, h->maxelem);
+ return -IPSET_ERR_HASH_FULL;
+ } else if (h->elements >= h->maxelem) {
+ goto set_full;
+ }
+ old = NULL;
+ n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
+ GFP_ATOMIC);
+ if (!n)
+ return -ENOMEM;
+ n->size = AHASH_INIT_SIZE;
+ goto copy_elem;
+ }
for (i = 0; i < n->pos; i++) {
+ if (!test_bit(i, n->used)) {
+ /* Reuse first deleted entry */
+ if (j == -1) {
+ deleted = reuse = true;
+ j = i;
+ }
+ continue;
+ }
data = ahash_data(n, i, set->dsize);
if (mtype_data_equal(data, d, &multi)) {
if (flag_exist ||
@@ -645,85 +759,97 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
ip_set_timeout_expired(ext_timeout(data, set)))) {
/* Just the extensions could be overwritten */
j = i;
- goto reuse_slot;
- } else {
- ret = -IPSET_ERR_EXIST;
- goto out;
+ goto overwrite_extensions;
}
+ return -IPSET_ERR_EXIST;
}
/* Reuse first timed out entry */
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(data, set)) &&
- j != AHASH_MAX(h) + 1)
+ j == -1) {
j = i;
+ reuse = true;
+ }
}
- if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set) && n->pos) {
- /* Choosing the first entry in the array to replace */
- j = 0;
- goto reuse_slot;
- }
- if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem)
- /* FIXME: when set is full, we slow down here */
- mtype_expire(set, h, NLEN(set->family), set->dsize);
-
- if (h->elements >= h->maxelem) {
- if (net_ratelimit())
- pr_warn("Set %s is full, maxelem %u reached\n",
- set->name, h->maxelem);
- ret = -IPSET_ERR_HASH_FULL;
- goto out;
- }
-
-reuse_slot:
- if (j != AHASH_MAX(h) + 1) {
- /* Fill out reused slot */
+ if (reuse || forceadd) {
+ /* Forceadd with no deleted/timed out slot: replace the first entry */
+ if (j == -1)
+ j = 0;
data = ahash_data(n, j, set->dsize);
+ if (!deleted) {
#ifdef IP_SET_HASH_WITH_NETS
- for (i = 0; i < IPSET_NET_COUNT; i++) {
- mtype_del_cidr(h, SCIDR(data->cidr, i),
- NLEN(set->family), i);
- mtype_add_cidr(h, SCIDR(d->cidr, i),
- NLEN(set->family), i);
- }
+ for (i = 0; i < IPSET_NET_COUNT; i++)
+ mtype_del_cidr(h,
+ NCIDR_PUT(DCIDR_GET(data->cidr, i)),
+ NLEN(set->family), i);
#endif
- ip_set_ext_destroy(set, data);
- } else {
- /* Use/create a new slot */
+ ip_set_ext_destroy(set, data);
+ h->elements--;
+ }
+ goto copy_data;
+ }
+ if (h->elements >= h->maxelem)
+ goto set_full;
+ /* Create a new slot */
+ if (n->pos >= n->size) {
TUNE_AHASH_MAX(h, multi);
- ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize);
- if (ret != 0) {
- if (ret == -EAGAIN)
- mtype_data_next(&h->next, d);
- goto out;
+ if (n->size >= AHASH_MAX(h)) {
+ /* Trigger rehashing */
+ mtype_data_next(&h->next, d);
+ return -EAGAIN;
}
- data = ahash_data(n, n->pos++, set->dsize);
+ old = n;
+ n = kzalloc(sizeof(*n) +
+ (old->size + AHASH_INIT_SIZE) * set->dsize,
+ GFP_ATOMIC);
+ if (!n)
+ return -ENOMEM;
+ memcpy(n, old, sizeof(struct hbucket) +
+ old->size * set->dsize);
+ n->size = old->size + AHASH_INIT_SIZE;
+ }
+
+copy_elem:
+ j = n->pos++;
+ data = ahash_data(n, j, set->dsize);
+copy_data:
+ h->elements++;
#ifdef IP_SET_HASH_WITH_NETS
- for (i = 0; i < IPSET_NET_COUNT; i++)
- mtype_add_cidr(h, SCIDR(d->cidr, i), NLEN(set->family),
- i);
+ for (i = 0; i < IPSET_NET_COUNT; i++)
+ mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)),
+ NLEN(set->family), i);
#endif
- h->elements++;
- }
memcpy(data, d, sizeof(struct mtype_elem));
+overwrite_extensions:
#ifdef IP_SET_HASH_WITH_NETS
mtype_data_set_flags(data, flags);
#endif
- if (SET_WITH_TIMEOUT(set))
- ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
if (SET_WITH_COUNTER(set))
ip_set_init_counter(ext_counter(data, set), ext);
if (SET_WITH_COMMENT(set))
ip_set_init_comment(ext_comment(data, set), ext);
if (SET_WITH_SKBINFO(set))
ip_set_init_skbinfo(ext_skbinfo(data, set), ext);
+ /* Must come last for the case when timed out entry is reused */
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
+ smp_mb__before_atomic();
+ set_bit(j, n->used);
+ if (old != ERR_PTR(-ENOENT)) {
+ rcu_assign_pointer(hbucket(t, key), n);
+ if (old)
+ kfree_rcu(old, rcu);
+ }
-out:
- rcu_read_unlock_bh();
- return ret;
+ return 0;
+set_full:
+ if (net_ratelimit())
+ pr_warn("Set %s is full, maxelem %u reached\n",
+ set->name, h->maxelem);
+ return -IPSET_ERR_HASH_FULL;
}
-/* Delete an element from the hash: swap it with the last element
- * and free up space if possible.
+/* Delete an element from the hash and free up space if possible.
*/
static int
mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
@@ -734,55 +857,70 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
const struct mtype_elem *d = value;
struct mtype_elem *data;
struct hbucket *n;
- int i, ret = -IPSET_ERR_EXIST;
-#ifdef IP_SET_HASH_WITH_NETS
- u8 j;
-#endif
+ int i, j, k, ret = -IPSET_ERR_EXIST;
u32 key, multi = 0;
+ size_t dsize = set->dsize;
- rcu_read_lock_bh();
- t = rcu_dereference_bh(h->table);
+ t = ipset_dereference_protected(h->table, set);
key = HKEY(value, h->initval, t->htable_bits);
- n = hbucket(t, key);
- for (i = 0; i < n->pos; i++) {
- data = ahash_data(n, i, set->dsize);
+ n = __ipset_dereference_protected(hbucket(t, key), 1);
+ if (!n)
+ goto out;
+ for (i = 0, k = 0; i < n->pos; i++) {
+ if (!test_bit(i, n->used)) {
+ k++;
+ continue;
+ }
+ data = ahash_data(n, i, dsize);
if (!mtype_data_equal(data, d, &multi))
continue;
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(data, set)))
goto out;
- if (i != n->pos - 1)
- /* Not last one */
- memcpy(data, ahash_data(n, n->pos - 1, set->dsize),
- set->dsize);
- n->pos--;
+ ret = 0;
+ clear_bit(i, n->used);
+ smp_mb__after_atomic();
+ if (i + 1 == n->pos)
+ n->pos--;
h->elements--;
#ifdef IP_SET_HASH_WITH_NETS
for (j = 0; j < IPSET_NET_COUNT; j++)
- mtype_del_cidr(h, SCIDR(d->cidr, j), NLEN(set->family),
- j);
+ mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
+ NLEN(set->family), j);
#endif
ip_set_ext_destroy(set, data);
- if (n->pos + AHASH_INIT_SIZE < n->size) {
- void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
- * set->dsize,
- GFP_ATOMIC);
- if (!tmp) {
- ret = 0;
+
+ for (; i < n->pos; i++) {
+ if (!test_bit(i, n->used))
+ k++;
+ }
+ if (n->pos == 0 && k == 0) {
+ rcu_assign_pointer(hbucket(t, key), NULL);
+ kfree_rcu(n, rcu);
+ } else if (k >= AHASH_INIT_SIZE) {
+ struct hbucket *tmp = kzalloc(sizeof(*tmp) +
+ (n->size - AHASH_INIT_SIZE) * dsize,
+ GFP_ATOMIC);
+ if (!tmp)
goto out;
+ tmp->size = n->size - AHASH_INIT_SIZE;
+ for (j = 0, k = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, dsize);
+ memcpy(tmp->value + k * dsize, data, dsize);
+ set_bit(k, tmp->used); /* slot k of the compacted copy, not source slot j */
+ k++;
}
- n->size -= AHASH_INIT_SIZE;
- memcpy(tmp, n->value, n->size * set->dsize);
- kfree(n->value);
- n->value = tmp;
+ tmp->pos = k;
+ rcu_assign_pointer(hbucket(t, key), tmp);
+ kfree_rcu(n, rcu);
}
- ret = 0;
goto out;
}
out:
- rcu_read_unlock_bh();
return ret;
}
@@ -801,7 +939,8 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
#ifdef IP_SET_HASH_WITH_NETS
/* Special test function which takes into account the different network
- * sizes added to the set */
+ * sizes added to the set
+ */
static int
mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
const struct ip_set_ext *ext,
@@ -824,16 +963,21 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) {
#if IPSET_NET_COUNT == 2
mtype_data_reset_elem(d, &orig);
- mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]), false);
+ mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false);
for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi;
k++) {
- mtype_data_netmask(d, NCIDR(h->nets[k].cidr[1]), true);
+ mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]),
+ true);
#else
- mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]));
+ mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]));
#endif
key = HKEY(d, h->initval, t->htable_bits);
- n = hbucket(t, key);
+ n = rcu_dereference_bh(hbucket(t, key));
+ if (!n)
+ continue;
for (i = 0; i < n->pos; i++) {
+ if (!test_bit(i, n->used))
+ continue;
data = ahash_data(n, i, set->dsize);
if (!mtype_data_equal(data, d, &multi))
continue;
@@ -871,13 +1015,13 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
int i, ret = 0;
u32 key, multi = 0;
- rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
#ifdef IP_SET_HASH_WITH_NETS
/* If we test an IP address and not a network address,
- * try all possible network sizes */
+ * try all possible network sizes
+ */
for (i = 0; i < IPSET_NET_COUNT; i++)
- if (GCIDR(d->cidr, i) != SET_HOST_MASK(set->family))
+ if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family))
break;
if (i == IPSET_NET_COUNT) {
ret = mtype_test_cidrs(set, d, ext, mext, flags);
@@ -886,8 +1030,14 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
#endif
key = HKEY(d, h->initval, t->htable_bits);
- n = hbucket(t, key);
+ n = rcu_dereference_bh(hbucket(t, key));
+ if (!n) {
+ ret = 0;
+ goto out;
+ }
for (i = 0; i < n->pos; i++) {
+ if (!test_bit(i, n->used))
+ continue;
data = ahash_data(n, i, set->dsize);
if (mtype_data_equal(data, d, &multi) &&
!(SET_WITH_TIMEOUT(set) &&
@@ -897,7 +1047,6 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
}
out:
- rcu_read_unlock_bh();
return ret;
}
@@ -909,15 +1058,19 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
const struct htable *t;
struct nlattr *nested;
size_t memsize;
+ u8 htable_bits;
+ rcu_read_lock_bh();
t = rcu_dereference_bh_nfnl(h->table);
memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize);
+ htable_bits = t->htable_bits;
+ rcu_read_unlock_bh();
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE,
- htonl(jhash_size(t->htable_bits))) ||
+ htonl(jhash_size(htable_bits))) ||
nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
goto nla_put_failure;
#ifdef IP_SET_HASH_WITH_NETMASK
@@ -941,32 +1094,63 @@ nla_put_failure:
return -EMSGSIZE;
}
+/* Make possible to run dumping parallel with resizing */
+static void
+mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start)
+{
+ struct htype *h = set->data;
+ struct htable *t;
+
+ if (start) {
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh_nfnl(h->table);
+ atomic_inc(&t->uref);
+ cb->args[IPSET_CB_PRIVATE] = (unsigned long)t;
+ rcu_read_unlock_bh();
+ } else if (cb->args[IPSET_CB_PRIVATE]) {
+ t = (struct htable *)cb->args[IPSET_CB_PRIVATE];
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+ /* Resizing didn't destroy the hash table */
+ pr_debug("Table destroy by dump: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
+ cb->args[IPSET_CB_PRIVATE] = 0;
+ }
+}
+
/* Reply a LIST/SAVE request: dump the elements of the specified set */
static int
mtype_list(const struct ip_set *set,
struct sk_buff *skb, struct netlink_callback *cb)
{
- const struct htype *h = set->data;
- const struct htable *t = rcu_dereference_bh_nfnl(h->table);
+ const struct htable *t;
struct nlattr *atd, *nested;
const struct hbucket *n;
const struct mtype_elem *e;
u32 first = cb->args[IPSET_CB_ARG0];
/* We assume that one hash bucket fills into one page */
void *incomplete;
- int i;
+ int i, ret = 0;
atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
return -EMSGSIZE;
+
pr_debug("list hash set %s\n", set->name);
+ t = (const struct htable *)cb->args[IPSET_CB_PRIVATE];
+ /* Expire may replace a hbucket with another one */
+ rcu_read_lock();
for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits);
cb->args[IPSET_CB_ARG0]++) {
incomplete = skb_tail_pointer(skb);
- n = hbucket(t, cb->args[IPSET_CB_ARG0]);
+ n = rcu_dereference(hbucket(t, cb->args[IPSET_CB_ARG0]));
pr_debug("cb->arg bucket: %lu, t %p n %p\n",
cb->args[IPSET_CB_ARG0], t, n);
+ if (!n)
+ continue;
for (i = 0; i < n->pos; i++) {
+ if (!test_bit(i, n->used))
+ continue;
e = ahash_data(n, i, set->dsize);
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
@@ -977,9 +1161,10 @@ mtype_list(const struct ip_set *set,
if (!nested) {
if (cb->args[IPSET_CB_ARG0] == first) {
nla_nest_cancel(skb, atd);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
+ ret = -EMSGSIZE;
+ goto out;
+ }
+ goto nla_put_failure;
}
if (mtype_data_list(skb, e))
goto nla_put_failure;
@@ -992,7 +1177,7 @@ mtype_list(const struct ip_set *set,
/* Set listing finished */
cb->args[IPSET_CB_ARG0] = 0;
- return 0;
+ goto out;
nla_put_failure:
nlmsg_trim(skb, incomplete);
@@ -1000,20 +1185,24 @@ nla_put_failure:
pr_warn("Can't list set %s: one bucket does not fit into a message. Please report it!\n",
set->name);
cb->args[IPSET_CB_ARG0] = 0;
- return -EMSGSIZE;
+ ret = -EMSGSIZE;
+ } else {
+ ipset_nest_end(skb, atd);
}
- ipset_nest_end(skb, atd);
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static int
IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb,
- const struct xt_action_param *par,
- enum ipset_adt adt, struct ip_set_adt_opt *opt);
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt);
static int
IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);
+ enum ipset_adt adt, u32 *lineno, u32 flags,
+ bool retried);
static const struct ip_set_type_variant mtype_variant = {
.kadt = mtype_kadt,
@@ -1027,6 +1216,7 @@ static const struct ip_set_type_variant mtype_variant = {
.flush = mtype_flush,
.head = mtype_head,
.list = mtype_list,
+ .uref = mtype_uref,
.resize = mtype_resize,
.same_set = mtype_same_set,
};
@@ -1045,7 +1235,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
u8 netmask;
#endif
size_t hsize;
- struct HTYPE *h;
+ struct htype *h;
struct htable *t;
#ifndef IP_SET_PROTO_UNDEF
@@ -1064,12 +1254,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
-#ifdef IP_SET_HASH_WITH_MARKMASK
- !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) ||
-#endif
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ /* Separated condition in order to avoid directive in argument list */
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK)))
+ return -IPSET_ERR_PROTOCOL;
+#endif
if (tb[IPSET_ATTR_HASHSIZE]) {
hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
@@ -1092,7 +1284,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
if (tb[IPSET_ATTR_MARKMASK]) {
- markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK]));
+ markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK]));
if (markmask == 0)
return -IPSET_ERR_INVALID_MARKMASK;
@@ -1137,12 +1329,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#endif
set->variant = &IPSET_TOKEN(HTYPE, 4_variant);
set->dsize = ip_set_elem_len(set, tb,
- sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)));
+ sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)),
+ __alignof__(struct IPSET_TOKEN(HTYPE, 4_elem)));
#ifndef IP_SET_PROTO_UNDEF
} else {
set->variant = &IPSET_TOKEN(HTYPE, 6_variant);
set->dsize = ip_set_elem_len(set, tb,
- sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)));
+ sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)),
+ __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem)));
}
#endif
if (tb[IPSET_ATTR_TIMEOUT]) {
@@ -1165,3 +1359,5 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
return 0;
}
#endif /* IP_SET_EMIT_CREATE */
+
+#undef HKEY_DATALEN
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ip.c b/kernel/net/netfilter/ipset/ip_set_hash_ip.c
index 76959d79e..9d6bf19f7 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_ip.c
@@ -56,15 +56,15 @@ hash_ip4_data_equal(const struct hash_ip4_elem *e1,
return e1->ip == e2->ip;
}
-static inline bool
+static bool
hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e)
{
if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -74,7 +74,6 @@ hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e)
}
#define MTYPE hash_ip4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
@@ -109,20 +108,17 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
u32 ip = 0, ip_to = 0, hosts;
int ret = 0;
- if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ if (unlikely(!tb[IPSET_ATTR_IP]))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -145,7 +141,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
}
@@ -162,8 +158,8 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -196,10 +192,10 @@ hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e)
{
if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -208,12 +204,9 @@ hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e)
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
-#undef HKEY_DATALEN
#define MTYPE hash_ip6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
@@ -247,22 +240,25 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
- if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
- tb[IPSET_ATTR_IP_TO] ||
- tb[IPSET_ATTR_CIDR]))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ if (unlikely(!tb[IPSET_ATTR_IP]))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+ if (unlikely(tb[IPSET_ATTR_CIDR])) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr != HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -301,7 +297,8 @@ static struct ip_set_type hash_ip_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -318,6 +315,7 @@ hash_ip_init(void)
static void __exit
hash_ip_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_ip_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c b/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c
index 7abf9788c..a0695a2ab 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -63,10 +63,10 @@ hash_ipmark4_data_list(struct sk_buff *skb,
if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -76,10 +76,8 @@ hash_ipmark4_data_next(struct hash_ipmark4_elem *next,
next->ip = d->ip;
}
-#define MTYPE hash_ipmark4
-#define PF 4
-#define HOST_MASK 32
-#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem)
+#define MTYPE hash_ipmark4
+#define HOST_MASK 32
#include "ip_set_hash_gen.h"
static int
@@ -110,25 +108,22 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
u32 ip, ip_to = 0;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_attr_netorder(tb, IPSET_ATTR_MARK)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+ e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK]));
e.mark &= h->markmask;
if (adt == IPSET_TEST ||
@@ -147,7 +142,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
}
@@ -160,8 +155,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -191,10 +186,10 @@ hash_ipmark6_data_list(struct sk_buff *skb,
if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -204,18 +199,13 @@ hash_ipmark6_data_next(struct hash_ipmark4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
-#undef HKEY_DATALEN
#define MTYPE hash_ipmark6
-#define PF 6
#define HOST_MASK 128
-#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem)
-#define IP_SET_EMIT_CREATE
+#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
-
static int
hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
@@ -243,27 +233,30 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
- tb[IPSET_ATTR_IP_TO] ||
- tb[IPSET_ATTR_CIDR]))
+ !ip_set_attr_netorder(tb, IPSET_ATTR_MARK)))
return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+ if (unlikely(tb[IPSET_ATTR_CIDR])) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (cidr != HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
if (ret)
return ret;
- e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK]));
e.mark &= h->markmask;
if (adt == IPSET_TEST) {
@@ -274,10 +267,8 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
- return ret;
+ return 0;
}
static struct ip_set_type hash_ipmark_type __read_mostly = {
@@ -307,7 +298,8 @@ static struct ip_set_type hash_ipmark_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -324,6 +316,7 @@ hash_ipmark_init(void)
static void __exit
hash_ipmark_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_ipmark_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipport.c b/kernel/net/netfilter/ipset/ip_set_hash_ipport.c
index dcbcceb9a..9d84b3dff 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -69,10 +69,10 @@ hash_ipport4_data_list(struct sk_buff *skb,
nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -83,10 +83,8 @@ hash_ipport4_data_next(struct hash_ipport4_elem *next,
next->port = d->port;
}
-#define MTYPE hash_ipport4
-#define PF 4
-#define HOST_MASK 32
-#define HKEY_DATALEN sizeof(struct hash_ipport4_elem)
+#define MTYPE hash_ipport4
+#define HOST_MASK 32
#include "ip_set_hash_gen.h"
static int
@@ -118,29 +116,23 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
bool with_ports = false;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -148,8 +140,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMP))
e.port = 0;
@@ -171,7 +164,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
}
@@ -195,8 +188,8 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
}
return ret;
@@ -231,10 +224,10 @@ hash_ipport6_data_list(struct sk_buff *skb,
nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -245,15 +238,11 @@ hash_ipport6_data_next(struct hash_ipport4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
-#undef HKEY_DATALEN
#define MTYPE hash_ipport6
-#define PF 6
#define HOST_MASK 128
-#define HKEY_DATALEN sizeof(struct hash_ipport6_elem)
-#define IP_SET_EMIT_CREATE
+#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
static int
@@ -285,31 +274,31 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
bool with_ports = false;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
- tb[IPSET_ATTR_IP_TO] ||
- tb[IPSET_ATTR_CIDR]))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+ if (unlikely(tb[IPSET_ATTR_CIDR])) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (cidr != HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
if (ret)
return ret;
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -317,8 +306,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMPV6))
e.port = 0;
@@ -341,8 +331,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -376,7 +366,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -393,6 +384,7 @@ hash_ipport_init(void)
static void __exit
hash_ipport_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_ipport_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c b/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c
index 7ef93fc88..215b7b942 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -63,17 +63,17 @@ hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
static bool
hash_ipportip4_data_list(struct sk_buff *skb,
- const struct hash_ipportip4_elem *data)
+ const struct hash_ipportip4_elem *data)
{
if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) ||
nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -86,7 +86,6 @@ hash_ipportip4_data_next(struct hash_ipportip4_elem *next,
/* Common functions */
#define MTYPE hash_ipportip4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
@@ -120,22 +119,19 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
bool with_ports = false;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -143,10 +139,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -154,8 +147,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMP))
e.port = 0;
@@ -177,7 +171,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
}
@@ -201,8 +195,8 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
}
return ret;
@@ -240,10 +234,10 @@ hash_ipportip6_data_list(struct sk_buff *skb,
nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -254,11 +248,9 @@ hash_ipportip6_data_next(struct hash_ipportip4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_ipportip6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
@@ -293,24 +285,27 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
bool with_ports = false;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
- tb[IPSET_ATTR_IP_TO] ||
- tb[IPSET_ATTR_CIDR]))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO)))
return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+ if (unlikely(tb[IPSET_ATTR_CIDR])) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (cidr != HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -318,10 +313,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -329,8 +321,9 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMPV6))
e.port = 0;
@@ -353,8 +346,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -388,7 +381,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -405,6 +399,7 @@ hash_ipportip_init(void)
static void __exit
hash_ipportip_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_ipportip_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c b/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c
index b6012ad92..9ca719625 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -114,10 +114,10 @@ hash_ipportnet4_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -130,7 +130,6 @@ hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next,
}
#define MTYPE hash_ipportnet4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
@@ -142,7 +141,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_ipportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet4_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -174,23 +173,20 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
u8 cidr;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -205,10 +201,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
e.cidr = cidr - 1;
}
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -216,14 +209,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMP))
e.port = 0;
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -249,7 +244,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
} else if (tb[IPSET_ATTR_CIDR]) {
cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > 32)
+ if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
ip_set_mask_from_to(ip, ip_to, cidr);
}
@@ -270,8 +265,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip2_from, ip2_to);
if (ip2_from + UINT_MAX == ip2_to)
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1);
+ }
if (retried)
ip = ntohl(h->next.ip);
@@ -294,8 +290,8 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
ip2 = ip2_last + 1;
}
}
@@ -367,10 +363,10 @@ hash_ipportnet6_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -381,11 +377,9 @@ hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_ipportnet6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
@@ -398,7 +392,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_ipportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet6_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -429,27 +423,28 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
u8 cidr;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
- tb[IPSET_ATTR_IP_TO] ||
- tb[IPSET_ATTR_CIDR]))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+ if (unlikely(tb[IPSET_ATTR_CIDR])) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (cidr != HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -466,10 +461,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
ip6_netmask(&e.ip2, e.cidr + 1);
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -477,14 +469,16 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMPV6))
e.port = 0;
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -508,8 +502,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -547,7 +541,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -564,6 +559,7 @@ hash_ipportnet_init(void)
static void __exit
hash_ipportnet_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_ipportnet_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_mac.c b/kernel/net/netfilter/ipset/ip_set_hash_mac.c
index 65690b52a..f1e7d2c0f 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_mac.c
@@ -52,7 +52,12 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1,
static inline bool
hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e)
{
- return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether);
+ if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether))
+ goto nla_put_failure;
+ return false;
+
+nla_put_failure:
+ return true;
}
static inline void
@@ -62,7 +67,6 @@ hash_mac4_data_next(struct hash_mac4_elem *next,
}
#define MTYPE hash_mac4
-#define PF 4
#define HOST_MASK 32
#define IP_SET_EMIT_CREATE
#define IP_SET_PROTO_UNDEF
@@ -85,10 +89,10 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb,
return 0;
if (skb_mac_header(skb) < skb->head ||
- (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
return -EINVAL;
- memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
+ ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0)
return -EINVAL;
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
@@ -103,22 +107,16 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
- if (unlikely(!tb[IPSET_ATTR_ETHER] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (unlikely(!tb[IPSET_ATTR_ETHER]))
+ return -IPSET_ERR_PROTOCOL;
+
ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+ ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]));
if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0)
return -IPSET_ERR_HASH_ELEM;
@@ -149,7 +147,8 @@ static struct ip_set_type hash_mac_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -166,6 +165,7 @@ hash_mac_init(void)
static void __exit
hash_mac_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_mac_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_net.c b/kernel/net/netfilter/ipset/ip_set_hash_net.c
index 6b3ac10ac..3e4bffdc1 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_net.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_net.c
@@ -95,10 +95,10 @@ hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data)
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -109,7 +109,6 @@ hash_net4_data_next(struct hash_net4_elem *next,
}
#define MTYPE hash_net4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
@@ -121,7 +120,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_net *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net4_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -147,21 +146,18 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
u32 ip = 0, ip_to = 0, last;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -173,6 +169,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -180,7 +177,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
e.ip = htonl(ip & ip_set_hostmask(e.cidr));
ret = adtfn(set, &e, &ext, &ext, flags);
- return ip_set_enomatch(ret, flags, adt, set) ? -ret:
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
ip_set_eexist(ret, flags) ? 0 : ret;
}
@@ -202,8 +199,8 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
ip = last + 1;
}
return ret;
@@ -264,10 +261,10 @@ hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data)
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -277,11 +274,9 @@ hash_net6_data_next(struct hash_net4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_net6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
@@ -294,7 +289,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_net *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net6_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -318,36 +313,34 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- if (tb[IPSET_ATTR_CIDR])
+ if (tb[IPSET_ATTR_CIDR]) {
e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
-
- if (!e.cidr || e.cidr > HOST_MASK)
- return -IPSET_ERR_INVALID_CIDR;
+ if (!e.cidr || e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
ip6_netmask(&e.ip, e.cidr);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -383,7 +376,8 @@ static struct ip_set_type hash_net_type __read_mostly = {
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -400,6 +394,7 @@ hash_net_init(void)
static void __exit
hash_net_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_net_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netiface.c b/kernel/net/netfilter/ipset/ip_set_hash_netiface.c
index 380ef5148..43d8c9896 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -13,7 +13,6 @@
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/random.h>
-#include <linux/rbtree.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/netlink.h>
@@ -37,88 +36,13 @@ MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net,iface");
-/* Interface name rbtree */
-
-struct iface_node {
- struct rb_node node;
- char iface[IFNAMSIZ];
-};
-
-#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface)
-
-static void
-rbtree_destroy(struct rb_root *root)
-{
- struct iface_node *node, *next;
-
- rbtree_postorder_for_each_entry_safe(node, next, root, node)
- kfree(node);
-
- *root = RB_ROOT;
-}
-
-static int
-iface_test(struct rb_root *root, const char **iface)
-{
- struct rb_node *n = root->rb_node;
-
- while (n) {
- const char *d = iface_data(n);
- int res = strcmp(*iface, d);
-
- if (res < 0)
- n = n->rb_left;
- else if (res > 0)
- n = n->rb_right;
- else {
- *iface = d;
- return 1;
- }
- }
- return 0;
-}
-
-static int
-iface_add(struct rb_root *root, const char **iface)
-{
- struct rb_node **n = &(root->rb_node), *p = NULL;
- struct iface_node *d;
-
- while (*n) {
- char *ifname = iface_data(*n);
- int res = strcmp(*iface, ifname);
-
- p = *n;
- if (res < 0)
- n = &((*n)->rb_left);
- else if (res > 0)
- n = &((*n)->rb_right);
- else {
- *iface = ifname;
- return 0;
- }
- }
-
- d = kzalloc(sizeof(*d), GFP_ATOMIC);
- if (!d)
- return -ENOMEM;
- strcpy(d->iface, *iface);
-
- rb_link_node(&d->node, p, n);
- rb_insert_color(&d->node, root);
-
- *iface = d->iface;
- return 0;
-}
-
/* Type specific function prefix */
#define HTYPE hash_netiface
#define IP_SET_HASH_WITH_NETS
-#define IP_SET_HASH_WITH_RBTREE
#define IP_SET_HASH_WITH_MULTI
#define IP_SET_HASH_WITH_NET0
-#define STREQ(a, b) (strcmp(a, b) == 0)
+#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ)
/* IPv4 variant */
@@ -137,7 +61,7 @@ struct hash_netiface4_elem {
u8 cidr;
u8 nomatch;
u8 elem;
- const char *iface;
+ char iface[IFNAMSIZ];
};
/* Common functions */
@@ -151,7 +75,7 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
ip1->cidr == ip2->cidr &&
(++*multi) &&
ip1->physdev == ip2->physdev &&
- ip1->iface == ip2->iface;
+ strcmp(ip1->iface, ip2->iface) == 0;
}
static inline int
@@ -193,10 +117,10 @@ hash_netiface4_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -207,7 +131,6 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next,
}
#define MTYPE hash_netiface4
-#define PF 4
#define HOST_MASK 32
#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed)
#include "ip_set_hash_gen.h"
@@ -220,7 +143,7 @@ static const char *get_physindev_name(const struct sk_buff *skb)
return dev ? dev->name : NULL;
}
-static const char *get_phyoutdev_name(const struct sk_buff *skb)
+static const char *get_physoutdev_name(const struct sk_buff *skb)
{
struct net_device *dev = nf_bridge_get_physoutdev(skb);
@@ -236,11 +159,10 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
struct hash_netiface *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface4_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
.elem = 1,
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- int ret;
if (e.cidr == 0)
return -EINVAL;
@@ -250,35 +172,25 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
e.ip &= ip_set_netmask(e.cidr);
-#define IFACE(dir) (par->dir ? par->dir->name : NULL)
+#define IFACE(dir) (par->dir ? par->dir->name : "")
#define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC)
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- e.iface = SRCDIR ? get_physindev_name(skb) :
- get_phyoutdev_name(skb);
+ const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ get_physoutdev_name(skb);
- if (!e.iface)
+ if (!eiface)
return -EINVAL;
+ STRLCPY(e.iface, eiface);
e.physdev = 1;
-#else
- e.iface = NULL;
#endif
- } else
- e.iface = SRCDIR ? IFACE(in) : IFACE(out);
+ } else {
+ STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+ }
- if (!e.iface)
+ if (strlen(e.iface) == 0)
return -EINVAL;
- ret = iface_test(&h->rbtree, &e.iface);
- if (adt == IPSET_ADD) {
- if (!ret) {
- ret = iface_add(&h->rbtree, &e.iface);
- if (ret)
- return ret;
- }
- } else if (!ret)
- return ret;
-
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -291,25 +203,21 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, last;
- char iface[IFNAMSIZ];
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!tb[IPSET_ATTR_IFACE] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -318,21 +226,11 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
-
- strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
- e.iface = iface;
- ret = iface_test(&h->rbtree, &e.iface);
- if (adt == IPSET_ADD) {
- if (!ret) {
- ret = iface_add(&h->rbtree, &e.iface);
- if (ret)
- return ret;
- }
- } else if (!ret)
- return ret;
+ nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_PHYSDEV)
e.physdev = 1;
if (cadt_flags & IPSET_FLAG_NOMATCH)
@@ -353,8 +251,9 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip, ip_to);
if (ip + UINT_MAX == ip_to)
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip, ip_to, e.cidr);
+ }
if (retried)
ip = ntohl(h->next.ip);
@@ -365,8 +264,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
ip = last + 1;
}
return ret;
@@ -388,7 +287,7 @@ struct hash_netiface6_elem {
u8 cidr;
u8 nomatch;
u8 elem;
- const char *iface;
+ char iface[IFNAMSIZ];
};
/* Common functions */
@@ -402,7 +301,7 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
ip1->cidr == ip2->cidr &&
(++*multi) &&
ip1->physdev == ip2->physdev &&
- ip1->iface == ip2->iface;
+ strcmp(ip1->iface, ip2->iface) == 0;
}
static inline int
@@ -444,10 +343,10 @@ hash_netiface6_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -457,12 +356,9 @@ hash_netiface6_data_next(struct hash_netiface4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
-#undef HKEY_DATALEN
#define MTYPE hash_netiface6
-#define PF 6
#define HOST_MASK 128
#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed)
#define IP_SET_EMIT_CREATE
@@ -476,11 +372,10 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
struct hash_netiface *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface6_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
.elem = 1,
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- int ret;
if (e.cidr == 0)
return -EINVAL;
@@ -492,85 +387,64 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- e.iface = SRCDIR ? get_physindev_name(skb) :
- get_phyoutdev_name(skb);
- if (!e.iface)
- return -EINVAL;
+ const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ get_physoutdev_name(skb);
+ if (!eiface)
+ return -EINVAL;
+ STRLCPY(e.iface, eiface);
e.physdev = 1;
-#else
- e.iface = NULL;
#endif
- } else
- e.iface = SRCDIR ? IFACE(in) : IFACE(out);
+ } else {
+ STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+ }
- if (!e.iface)
+ if (strlen(e.iface) == 0)
return -EINVAL;
- ret = iface_test(&h->rbtree, &e.iface);
- if (adt == IPSET_ADD) {
- if (!ret) {
- ret = iface_add(&h->rbtree, &e.iface);
- if (ret)
- return ret;
- }
- } else if (!ret)
- return ret;
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- struct hash_netiface *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- char iface[IFNAMSIZ];
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!tb[IPSET_ATTR_IFACE] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- if (tb[IPSET_ATTR_CIDR])
+ if (tb[IPSET_ATTR_CIDR]) {
e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (e.cidr > HOST_MASK)
- return -IPSET_ERR_INVALID_CIDR;
+ if (e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
+
ip6_netmask(&e.ip, e.cidr);
- strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
- e.iface = iface;
- ret = iface_test(&h->rbtree, &e.iface);
- if (adt == IPSET_ADD) {
- if (!ret) {
- ret = iface_add(&h->rbtree, &e.iface);
- if (ret)
- return ret;
- }
- } else if (!ret)
- return ret;
+ nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_PHYSDEV)
e.physdev = 1;
if (cadt_flags & IPSET_FLAG_NOMATCH)
@@ -613,7 +487,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -630,6 +505,7 @@ hash_netiface_init(void)
static void __exit
hash_netiface_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_netiface_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netnet.c b/kernel/net/netfilter/ipset/ip_set_hash_netnet.c
index ea8772afb..a93dfebff 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -57,8 +57,8 @@ struct hash_netnet4_elem {
static inline bool
hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
- const struct hash_netnet4_elem *ip2,
- u32 *multi)
+ const struct hash_netnet4_elem *ip2,
+ u32 *multi)
{
return ip1->ipcmp == ip2->ipcmp &&
ip1->ccmp == ip2->ccmp;
@@ -84,7 +84,7 @@ hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags)
static inline void
hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem,
- struct hash_netnet4_elem *orig)
+ struct hash_netnet4_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
@@ -103,7 +103,7 @@ hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner)
static bool
hash_netnet4_data_list(struct sk_buff *skb,
- const struct hash_netnet4_elem *data)
+ const struct hash_netnet4_elem *data)
{
u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
@@ -122,28 +122,34 @@ nla_put_failure:
static inline void
hash_netnet4_data_next(struct hash_netnet4_elem *next,
- const struct hash_netnet4_elem *d)
+ const struct hash_netnet4_elem *d)
{
next->ipcmp = d->ipcmp;
}
#define MTYPE hash_netnet4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
+static void
+hash_netnet4_init(struct hash_netnet4_elem *e)
+{
+ e->cidr[0] = HOST_MASK;
+ e->cidr[1] = HOST_MASK;
+}
+
static int
hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
- const struct xt_action_param *par,
- enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
- e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
@@ -157,7 +163,7 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
static int
hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
@@ -165,45 +171,43 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, last;
u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2;
- u8 cidr, cidr2;
int ret;
- e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ hash_netnet4_init(&e);
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
if (tb[IPSET_ATTR_CIDR]) {
- cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > HOST_MASK)
+ e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- e.cidr[0] = cidr;
}
if (tb[IPSET_ATTR_CIDR2]) {
- cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
- if (!cidr2 || cidr2 > HOST_MASK)
+ e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- e.cidr[1] = cidr2;
}
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -226,8 +230,9 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip, ip_to);
if (unlikely(ip + UINT_MAX == ip_to))
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip, ip_to, e.cidr[0]);
+ }
ip2_to = ip2_from;
if (tb[IPSET_ATTR_IP2_TO]) {
@@ -238,28 +243,27 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip2_from, ip2_to);
if (unlikely(ip2_from + UINT_MAX == ip2_to))
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
+ }
if (retried)
ip = ntohl(h->next.ip[0]);
while (!after(ip, ip_to)) {
e.ip[0] = htonl(ip);
- last = ip_set_range_to_cidr(ip, ip_to, &cidr);
- e.cidr[0] = cidr;
+ last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
ip2 = (retried &&
ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1])
: ip2_from;
while (!after(ip2, ip2_to)) {
e.ip[1] = htonl(ip2);
- last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2);
- e.cidr[1] = cidr2;
+ last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
ip2 = last2 + 1;
}
ip = last + 1;
@@ -283,8 +287,8 @@ struct hash_netnet6_elem {
static inline bool
hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
- const struct hash_netnet6_elem *ip2,
- u32 *multi)
+ const struct hash_netnet6_elem *ip2,
+ u32 *multi)
{
return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
@@ -311,7 +315,7 @@ hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags)
static inline void
hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem,
- struct hash_netnet6_elem *orig)
+ struct hash_netnet6_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
@@ -330,7 +334,7 @@ hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner)
static bool
hash_netnet6_data_list(struct sk_buff *skb,
- const struct hash_netnet6_elem *data)
+ const struct hash_netnet6_elem *data)
{
u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
@@ -349,34 +353,39 @@ nla_put_failure:
static inline void
hash_netnet6_data_next(struct hash_netnet4_elem *next,
- const struct hash_netnet6_elem *d)
+ const struct hash_netnet6_elem *d)
{
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_netnet6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
+static void
+hash_netnet6_init(struct hash_netnet6_elem *e)
+{
+ e->cidr[0] = HOST_MASK;
+ e->cidr[1] = HOST_MASK;
+}
+
static int
hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
- const struct xt_action_param *par,
- enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
const struct hash_netnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
- e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
- e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK;
+ e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;
ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6);
ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6);
@@ -388,50 +397,53 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
static int
hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
int ret;
- e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ hash_netnet6_init(&e);
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) ||
- ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]);
if (ret)
return ret;
- if (tb[IPSET_ATTR_CIDR])
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- if (tb[IPSET_ATTR_CIDR2])
+ if (tb[IPSET_ATTR_CIDR2]) {
e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
-
- if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] ||
- e.cidr[1] > HOST_MASK)
- return -IPSET_ERR_INVALID_CIDR;
+ if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
ip6_netmask(&e.ip[0], e.cidr[0]);
ip6_netmask(&e.ip[1], e.cidr[1]);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -470,7 +482,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = {
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -487,6 +500,7 @@ hash_netnet_init(void)
static void __exit
hash_netnet_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_netnet_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netport.c b/kernel/net/netfilter/ipset/ip_set_hash_netport.c
index c0ddb58d1..731813e0f 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_netport.c
@@ -110,10 +110,10 @@ hash_netport4_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -125,7 +125,6 @@ hash_netport4_data_next(struct hash_netport4_elem *next,
}
#define MTYPE hash_netport4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
@@ -137,7 +136,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_netport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport4_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -167,23 +166,20 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
u8 cidr;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -194,10 +190,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
e.cidr = cidr - 1;
}
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -205,8 +198,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMP))
e.port = 0;
@@ -215,6 +209,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -240,8 +235,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip, ip_to);
if (ip + UINT_MAX == ip_to)
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip, ip_to, e.cidr + 1);
+ }
if (retried)
ip = ntohl(h->next.ip);
@@ -257,8 +253,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
ip = last + 1;
}
@@ -326,10 +322,10 @@ hash_netport6_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
@@ -340,11 +336,9 @@ hash_netport6_data_next(struct hash_netport4_elem *next,
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_netport6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
@@ -357,7 +351,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct hash_netport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport6_elem e = {
- .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -387,25 +381,22 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
u8 cidr;
int ret;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -417,10 +408,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
}
ip6_netmask(&e.ip, e.cidr + 1);
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -428,14 +416,16 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMPV6))
e.port = 0;
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -459,8 +449,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -495,7 +485,8 @@ static struct ip_set_type hash_netport_type __read_mostly = {
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -512,6 +503,7 @@ hash_netport_init(void)
static void __exit
hash_netport_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_netport_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c b/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c
index bfaa94c7b..9a14c2378 100644
--- a/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -54,7 +54,7 @@ struct hash_netportnet4_elem {
u16 ccmp;
};
u16 padding;
- u8 nomatch:1;
+ u8 nomatch;
u8 proto;
};
@@ -62,8 +62,8 @@ struct hash_netportnet4_elem {
static inline bool
hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
- const struct hash_netportnet4_elem *ip2,
- u32 *multi)
+ const struct hash_netportnet4_elem *ip2,
+ u32 *multi)
{
return ip1->ipcmp == ip2->ipcmp &&
ip1->ccmp == ip2->ccmp &&
@@ -91,7 +91,7 @@ hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags)
static inline void
hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem,
- struct hash_netportnet4_elem *orig)
+ struct hash_netportnet4_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
@@ -111,7 +111,7 @@ hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem,
static bool
hash_netportnet4_data_list(struct sk_buff *skb,
- const struct hash_netportnet4_elem *data)
+ const struct hash_netportnet4_elem *data)
{
u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
@@ -124,37 +124,43 @@ hash_netportnet4_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
- const struct hash_netportnet4_elem *d)
+ const struct hash_netportnet4_elem *d)
{
next->ipcmp = d->ipcmp;
next->port = d->port;
}
#define MTYPE hash_netportnet4
-#define PF 4
#define HOST_MASK 32
#include "ip_set_hash_gen.h"
+static void
+hash_netportnet4_init(struct hash_netportnet4_elem *e)
+{
+ e->cidr[0] = HOST_MASK;
+ e->cidr[1] = HOST_MASK;
+}
+
static int
hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
- const struct xt_action_param *par,
- enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
- e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
@@ -172,7 +178,7 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
static int
hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
@@ -181,49 +187,43 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to;
u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
bool with_ports = false;
- u8 cidr, cidr2;
int ret;
- e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ hash_netportnet4_init(&e);
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
+ if (ret)
+ return ret;
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
- ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
if (tb[IPSET_ATTR_CIDR]) {
- cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!cidr || cidr > HOST_MASK)
+ e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- e.cidr[0] = cidr;
}
if (tb[IPSET_ATTR_CIDR2]) {
- cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
- if (!cidr || cidr > HOST_MASK)
+ e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- e.cidr[1] = cidr;
}
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -231,14 +231,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMP))
e.port = 0;
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -262,8 +264,9 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip, ip_to);
if (unlikely(ip + UINT_MAX == ip_to))
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip, ip_to, e.cidr[0]);
+ }
port_to = port = ntohs(e.port);
if (tb[IPSET_ATTR_PORT_TO]) {
@@ -281,16 +284,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip2_from, ip2_to);
if (unlikely(ip2_from + UINT_MAX == ip2_to))
return -IPSET_ERR_HASH_RANGE;
- } else
+ } else {
ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
+ }
if (retried)
ip = ntohl(h->next.ip[0]);
while (!after(ip, ip_to)) {
e.ip[0] = htonl(ip);
- ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr);
- e.cidr[0] = cidr;
+ ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port)
: port;
for (; p <= port_to; p++) {
@@ -301,13 +304,12 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
while (!after(ip2, ip2_to)) {
e.ip[1] = htonl(ip2);
ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
- &cidr2);
- e.cidr[1] = cidr2;
+ &e.cidr[1]);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
ip2 = ip2_last + 1;
}
}
@@ -326,7 +328,7 @@ struct hash_netportnet6_elem {
u16 ccmp;
};
u16 padding;
- u8 nomatch:1;
+ u8 nomatch;
u8 proto;
};
@@ -334,8 +336,8 @@ struct hash_netportnet6_elem {
static inline bool
hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1,
- const struct hash_netportnet6_elem *ip2,
- u32 *multi)
+ const struct hash_netportnet6_elem *ip2,
+ u32 *multi)
{
return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
@@ -364,7 +366,7 @@ hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags)
static inline void
hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem,
- struct hash_netportnet6_elem *orig)
+ struct hash_netportnet6_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
@@ -384,7 +386,7 @@ hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem,
static bool
hash_netportnet6_data_list(struct sk_buff *skb,
- const struct hash_netportnet6_elem *data)
+ const struct hash_netportnet6_elem *data)
{
u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
@@ -397,41 +399,46 @@ hash_netportnet6_data_list(struct sk_buff *skb,
(flags &&
nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
goto nla_put_failure;
- return 0;
+ return false;
nla_put_failure:
- return 1;
+ return true;
}
static inline void
hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
- const struct hash_netportnet6_elem *d)
+ const struct hash_netportnet6_elem *d)
{
next->port = d->port;
}
#undef MTYPE
-#undef PF
#undef HOST_MASK
#define MTYPE hash_netportnet6
-#define PF 6
#define HOST_MASK 128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"
+static void
+hash_netportnet6_init(struct hash_netportnet6_elem *e)
+{
+ e->cidr[0] = HOST_MASK;
+ e->cidr[1] = HOST_MASK;
+}
+
static int
hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
- const struct xt_action_param *par,
- enum ipset_adt adt, struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
- e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
if (adt == IPSET_TEST)
e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;
@@ -449,7 +456,7 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
static int
hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
- enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
const struct hash_netportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
@@ -459,47 +466,46 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
bool with_ports = false;
int ret;
- e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ hash_netportnet6_init(&e);
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
- if (tb[IPSET_ATTR_LINENO])
- *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]);
+ if (ret)
+ return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) ||
- ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) ||
- ip_set_get_extensions(set, tb, &ext);
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- if (tb[IPSET_ATTR_CIDR])
+ if (tb[IPSET_ATTR_CIDR]) {
e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
- if (tb[IPSET_ATTR_CIDR2])
+ if (tb[IPSET_ATTR_CIDR2]) {
e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
-
- if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] ||
- e.cidr[1] > HOST_MASK))
- return -IPSET_ERR_INVALID_CIDR;
+ if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
ip6_netmask(&e.ip[0], e.cidr[0]);
ip6_netmask(&e.ip[1], e.cidr[1]);
- if (tb[IPSET_ATTR_PORT])
- e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
- else
- return -IPSET_ERR_PROTOCOL;
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
@@ -507,14 +513,16 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
- } else
+ } else {
return -IPSET_ERR_MISSING_PROTO;
+ }
if (!(with_ports || e.proto == IPPROTO_ICMPV6))
e.port = 0;
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
}
@@ -538,8 +546,8 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret && !ip_set_eexist(ret, flags))
return ret;
- else
- ret = 0;
+
+ ret = 0;
}
return ret;
}
@@ -577,7 +585,8 @@ static struct ip_set_type hash_netportnet_type __read_mostly = {
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -594,6 +603,7 @@ hash_netportnet_init(void)
static void __exit
hash_netportnet_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&hash_netportnet_type);
}
diff --git a/kernel/net/netfilter/ipset/ip_set_list_set.c b/kernel/net/netfilter/ipset/ip_set_list_set.c
index f8f682806..bbede95c9 100644
--- a/kernel/net/netfilter/ipset/ip_set_list_set.c
+++ b/kernel/net/netfilter/ipset/ip_set_list_set.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/ip.h>
+#include <linux/rculist.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
@@ -27,8 +28,10 @@ MODULE_ALIAS("ip_set_list:set");
/* Member elements */
struct set_elem {
+ struct rcu_head rcu;
+ struct list_head list;
ip_set_id_t id;
-};
+} __aligned(__alignof__(u64));
struct set_adt_elem {
ip_set_id_t id;
@@ -41,12 +44,9 @@ struct list_set {
u32 size; /* size of set list array */
struct timer_list gc; /* garbage collection */
struct net *net; /* namespace */
- struct set_elem members[0]; /* the set members */
+ struct list_head members; /* the set members */
};
-#define list_set_elem(set, map, id) \
- (struct set_elem *)((void *)(map)->members + (id) * (set)->dsize)
-
static int
list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
@@ -54,17 +54,14 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
{
struct list_set *map = set->data;
struct set_elem *e;
- u32 i, cmdflags = opt->cmdflags;
+ u32 cmdflags = opt->cmdflags;
int ret;
/* Don't lookup sub-counters at all */
opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS;
if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE)
opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE;
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- return 0;
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -91,13 +88,9 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb,
{
struct list_set *map = set->data;
struct set_elem *e;
- u32 i;
int ret;
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- return 0;
+ list_for_each_entry(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -115,13 +108,9 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb,
{
struct list_set *map = set->data;
struct set_elem *e;
- u32 i;
int ret;
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- return 0;
+ list_for_each_entry(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -138,110 +127,65 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ int ret = -EINVAL;
+ rcu_read_lock();
switch (adt) {
case IPSET_TEST:
- return list_set_ktest(set, skb, par, opt, &ext);
+ ret = list_set_ktest(set, skb, par, opt, &ext);
+ break;
case IPSET_ADD:
- return list_set_kadd(set, skb, par, opt, &ext);
+ ret = list_set_kadd(set, skb, par, opt, &ext);
+ break;
case IPSET_DEL:
- return list_set_kdel(set, skb, par, opt, &ext);
+ ret = list_set_kdel(set, skb, par, opt, &ext);
+ break;
default:
break;
}
- return -EINVAL;
-}
-
-static bool
-id_eq(const struct ip_set *set, u32 i, ip_set_id_t id)
-{
- const struct list_set *map = set->data;
- const struct set_elem *e;
-
- if (i >= map->size)
- return 0;
+ rcu_read_unlock();
- e = list_set_elem(set, map, i);
- return !!(e->id == id &&
- !(SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(e, set))));
+ return ret;
}
-static int
-list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d,
- const struct ip_set_ext *ext)
-{
- struct list_set *map = set->data;
- struct set_elem *e = list_set_elem(set, map, i);
+/* Userspace interfaces: we are protected by the nfnl mutex */
- if (e->id != IPSET_INVALID_ID) {
- if (i == map->size - 1) {
- /* Last element replaced: e.g. add new,before,last */
- ip_set_put_byindex(map->net, e->id);
- ip_set_ext_destroy(set, e);
- } else {
- struct set_elem *x = list_set_elem(set, map,
- map->size - 1);
-
- /* Last element pushed off */
- if (x->id != IPSET_INVALID_ID) {
- ip_set_put_byindex(map->net, x->id);
- ip_set_ext_destroy(set, x);
- }
- memmove(list_set_elem(set, map, i + 1), e,
- set->dsize * (map->size - (i + 1)));
- /* Extensions must be initialized to zero */
- memset(e, 0, set->dsize);
- }
- }
-
- e->id = d->id;
- if (SET_WITH_TIMEOUT(set))
- ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
- if (SET_WITH_COUNTER(set))
- ip_set_init_counter(ext_counter(e, set), ext);
- if (SET_WITH_COMMENT(set))
- ip_set_init_comment(ext_comment(e, set), ext);
- if (SET_WITH_SKBINFO(set))
- ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
- return 0;
-}
-
-static int
-list_set_del(struct ip_set *set, u32 i)
+static void
+__list_set_del(struct ip_set *set, struct set_elem *e)
{
struct list_set *map = set->data;
- struct set_elem *e = list_set_elem(set, map, i);
ip_set_put_byindex(map->net, e->id);
+ /* We may call it, because we don't have a to be destroyed
+ * extension which is used by the kernel.
+ */
ip_set_ext_destroy(set, e);
+ kfree_rcu(e, rcu);
+}
- if (i < map->size - 1)
- memmove(e, list_set_elem(set, map, i + 1),
- set->dsize * (map->size - (i + 1)));
+static inline void
+list_set_del(struct ip_set *set, struct set_elem *e)
+{
+ list_del_rcu(&e->list);
+ __list_set_del(set, e);
+}
- /* Last element */
- e = list_set_elem(set, map, map->size - 1);
- e->id = IPSET_INVALID_ID;
- return 0;
+static inline void
+list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
+{
+ list_replace_rcu(&old->list, &e->list);
+ __list_set_del(set, old);
}
static void
set_cleanup_entries(struct ip_set *set)
{
struct list_set *map = set->data;
- struct set_elem *e;
- u32 i = 0;
+ struct set_elem *e, *n;
- while (i < map->size) {
- e = list_set_elem(set, map, i);
- if (e->id != IPSET_INVALID_ID &&
- ip_set_timeout_expired(ext_timeout(e, set)))
- list_set_del(set, i);
- /* Check element moved to position i in next loop */
- else
- i++;
- }
+ list_for_each_entry_safe(e, n, &map->members, list)
+ if (ip_set_timeout_expired(ext_timeout(e, set)))
+ list_set_del(set, e);
}
static int
@@ -250,31 +194,46 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
{
struct list_set *map = set->data;
struct set_adt_elem *d = value;
- struct set_elem *e;
- u32 i;
+ struct set_elem *e, *next, *prev = NULL;
int ret;
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- return 0;
- else if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(e, set)))
+ list_for_each_entry(e, &map->members, list) {
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
continue;
- else if (e->id != d->id)
+ else if (e->id != d->id) {
+ prev = e;
continue;
+ }
- if (d->before == 0)
- return 1;
- else if (d->before > 0)
- ret = id_eq(set, i + 1, d->refid);
- else
- ret = i > 0 && id_eq(set, i - 1, d->refid);
+ if (d->before == 0) {
+ ret = 1;
+ } else if (d->before > 0) {
+ next = list_next_entry(e, list);
+ ret = !list_is_last(&e->list, &map->members) &&
+ next->id == d->refid;
+ } else {
+ ret = prev && prev->id == d->refid;
+ }
return ret;
}
return 0;
}
+static void
+list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext,
+ struct set_elem *e)
+{
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(e, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(e, set), ext);
+ if (SET_WITH_SKBINFO(set))
+ ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
+ /* Update timeout last */
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
+}
static int
list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
@@ -282,60 +241,78 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
{
struct list_set *map = set->data;
struct set_adt_elem *d = value;
- struct set_elem *e;
+ struct set_elem *e, *n, *prev, *next;
bool flag_exist = flags & IPSET_FLAG_EXIST;
- u32 i, ret = 0;
if (SET_WITH_TIMEOUT(set))
set_cleanup_entries(set);
- /* Check already added element */
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- goto insert;
- else if (e->id != d->id)
+ /* Find where to add the new entry */
+ n = prev = next = NULL;
+ list_for_each_entry(e, &map->members, list) {
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
continue;
-
- if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) ||
- (d->before < 0 &&
- (i == 0 || !id_eq(set, i - 1, d->refid))))
- /* Before/after doesn't match */
+ else if (d->id == e->id)
+ n = e;
+ else if (d->before == 0 || e->id != d->refid)
+ continue;
+ else if (d->before > 0)
+ next = e;
+ else
+ prev = e;
+ }
+ /* Re-add already existing element */
+ if (n) {
+ if ((d->before > 0 && !next) ||
+ (d->before < 0 && !prev))
return -IPSET_ERR_REF_EXIST;
if (!flag_exist)
- /* Can't re-add */
return -IPSET_ERR_EXIST;
/* Update extensions */
- ip_set_ext_destroy(set, e);
+ ip_set_ext_destroy(set, n);
+ list_set_init_extensions(set, ext, n);
- if (SET_WITH_TIMEOUT(set))
- ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
- if (SET_WITH_COUNTER(set))
- ip_set_init_counter(ext_counter(e, set), ext);
- if (SET_WITH_COMMENT(set))
- ip_set_init_comment(ext_comment(e, set), ext);
- if (SET_WITH_SKBINFO(set))
- ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
/* Set is already added to the list */
ip_set_put_byindex(map->net, d->id);
return 0;
}
-insert:
- ret = -IPSET_ERR_LIST_FULL;
- for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- ret = d->before != 0 ? -IPSET_ERR_REF_EXIST
- : list_set_add(set, i, d, ext);
- else if (e->id != d->refid)
- continue;
- else if (d->before > 0)
- ret = list_set_add(set, i, d, ext);
- else if (i + 1 < map->size)
- ret = list_set_add(set, i + 1, d, ext);
+ /* Add new entry */
+ if (d->before == 0) {
+ /* Append */
+ n = list_empty(&map->members) ? NULL :
+ list_last_entry(&map->members, struct set_elem, list);
+ } else if (d->before > 0) {
+ /* Insert after next element */
+ if (!list_is_last(&next->list, &map->members))
+ n = list_next_entry(next, list);
+ } else {
+ /* Insert before prev element */
+ if (prev->list.prev != &map->members)
+ n = list_prev_entry(prev, list);
}
+ /* Can we replace a timed out entry? */
+ if (n &&
+ !(SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(n, set))))
+ n = NULL;
+
+ e = kzalloc(set->dsize, GFP_ATOMIC);
+ if (!e)
+ return -ENOMEM;
+ e->id = d->id;
+ INIT_LIST_HEAD(&e->list);
+ list_set_init_extensions(set, ext, e);
+ if (n)
+ list_set_replace(set, e, n);
+ else if (next)
+ list_add_tail_rcu(&e->list, &next->list);
+ else if (prev)
+ list_add_rcu(&e->list, &prev->list);
+ else
+ list_add_tail_rcu(&e->list, &map->members);
- return ret;
+ return 0;
}
static int
@@ -344,32 +321,30 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
{
struct list_set *map = set->data;
struct set_adt_elem *d = value;
- struct set_elem *e;
- u32 i;
-
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- return d->before != 0 ? -IPSET_ERR_REF_EXIST
- : -IPSET_ERR_EXIST;
- else if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(e, set)))
+ struct set_elem *e, *next, *prev = NULL;
+
+ list_for_each_entry(e, &map->members, list) {
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
continue;
- else if (e->id != d->id)
+ else if (e->id != d->id) {
+ prev = e;
continue;
+ }
- if (d->before == 0)
- return list_set_del(set, i);
- else if (d->before > 0) {
- if (!id_eq(set, i + 1, d->refid))
+ if (d->before > 0) {
+ next = list_next_entry(e, list);
+ if (list_is_last(&e->list, &map->members) ||
+ next->id != d->refid)
return -IPSET_ERR_REF_EXIST;
- return list_set_del(set, i);
- } else if (i == 0 || !id_eq(set, i - 1, d->refid))
- return -IPSET_ERR_REF_EXIST;
- else
- return list_set_del(set, i);
+ } else if (d->before < 0) {
+ if (!prev || prev->id != d->refid)
+ return -IPSET_ERR_REF_EXIST;
+ }
+ list_set_del(set, e);
+ return 0;
}
- return -IPSET_ERR_EXIST;
+ return d->before != 0 ? -IPSET_ERR_REF_EXIST : -IPSET_ERR_EXIST;
}
static int
@@ -383,19 +358,13 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
struct ip_set *s;
int ret = 0;
- if (unlikely(!tb[IPSET_ATTR_NAME] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
- return -IPSET_ERR_PROTOCOL;
-
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+ if (unlikely(!tb[IPSET_ATTR_NAME] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -410,6 +379,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
e.before = f & IPSET_FLAG_BEFORE;
}
@@ -447,27 +417,26 @@ static void
list_set_flush(struct ip_set *set)
{
struct list_set *map = set->data;
- struct set_elem *e;
- u32 i;
-
- for (i = 0; i < map->size; i++) {
- e = list_set_elem(set, map, i);
- if (e->id != IPSET_INVALID_ID) {
- ip_set_put_byindex(map->net, e->id);
- ip_set_ext_destroy(set, e);
- e->id = IPSET_INVALID_ID;
- }
- }
+ struct set_elem *e, *n;
+
+ list_for_each_entry_safe(e, n, &map->members, list)
+ list_set_del(set, e);
}
static void
list_set_destroy(struct ip_set *set)
{
struct list_set *map = set->data;
+ struct set_elem *e, *n;
if (SET_WITH_TIMEOUT(set))
del_timer_sync(&map->gc);
- list_set_flush(set);
+ list_for_each_entry_safe(e, n, &map->members, list) {
+ list_del(&e->list);
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+ kfree(e);
+ }
kfree(map);
set->data = NULL;
@@ -478,6 +447,11 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
{
const struct list_set *map = set->data;
struct nlattr *nested;
+ struct set_elem *e;
+ u32 n = 0;
+
+ list_for_each_entry(e, &map->members, list)
+ n++;
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
@@ -485,7 +459,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
- htonl(sizeof(*map) + map->size * set->dsize)))
+ htonl(sizeof(*map) + n * set->dsize)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -502,18 +476,22 @@ list_set_list(const struct ip_set *set,
{
const struct list_set *map = set->data;
struct nlattr *atd, *nested;
- u32 i, first = cb->args[IPSET_CB_ARG0];
- const struct set_elem *e;
+ u32 i = 0, first = cb->args[IPSET_CB_ARG0];
+ struct set_elem *e;
+ int ret = 0;
atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
return -EMSGSIZE;
- for (; cb->args[IPSET_CB_ARG0] < map->size;
- cb->args[IPSET_CB_ARG0]++) {
- i = cb->args[IPSET_CB_ARG0];
- e = list_set_elem(set, map, i);
- if (e->id == IPSET_INVALID_ID)
- goto finish;
+ list_for_each_entry(e, &map->members, list) {
+ if (i == first)
+ break;
+ i++;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_from(e, &map->members, list) {
+ i++;
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -521,9 +499,10 @@ list_set_list(const struct ip_set *set,
if (!nested) {
if (i == first) {
nla_nest_cancel(skb, atd);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
+ ret = -EMSGSIZE;
+ goto out;
+ }
+ goto nla_put_failure;
}
if (nla_put_string(skb, IPSET_ATTR_NAME,
ip_set_name_byindex(map->net, e->id)))
@@ -532,20 +511,23 @@ list_set_list(const struct ip_set *set,
goto nla_put_failure;
ipset_nest_end(skb, nested);
}
-finish:
+
ipset_nest_end(skb, atd);
/* Set listing finished */
cb->args[IPSET_CB_ARG0] = 0;
- return 0;
+ goto out;
nla_put_failure:
nla_nest_cancel(skb, nested);
if (unlikely(i == first)) {
cb->args[IPSET_CB_ARG0] = 0;
- return -EMSGSIZE;
+ ret = -EMSGSIZE;
}
+ cb->args[IPSET_CB_ARG0] = i - 1;
ipset_nest_end(skb, atd);
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static bool
@@ -577,12 +559,12 @@ static const struct ip_set_type_variant set_variant = {
static void
list_set_gc(unsigned long ul_set)
{
- struct ip_set *set = (struct ip_set *) ul_set;
+ struct ip_set *set = (struct ip_set *)ul_set;
struct list_set *map = set->data;
- write_lock_bh(&set->lock);
+ spin_lock_bh(&set->lock);
set_cleanup_entries(set);
- write_unlock_bh(&set->lock);
+ spin_unlock_bh(&set->lock);
map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&map->gc);
@@ -594,7 +576,7 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
struct list_set *map = set->data;
init_timer(&map->gc);
- map->gc.data = (unsigned long) set;
+ map->gc.data = (unsigned long)set;
map->gc.function = gc;
map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
add_timer(&map->gc);
@@ -606,24 +588,16 @@ static bool
init_list_set(struct net *net, struct ip_set *set, u32 size)
{
struct list_set *map;
- struct set_elem *e;
- u32 i;
- map = kzalloc(sizeof(*map) +
- min_t(u32, size, IP_SET_LIST_MAX_SIZE) * set->dsize,
- GFP_KERNEL);
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
if (!map)
return false;
map->size = size;
map->net = net;
+ INIT_LIST_HEAD(&map->members);
set->data = map;
- for (i = 0; i < size; i++) {
- e = list_set_elem(set, map, i);
- e->id = IPSET_INVALID_ID;
- }
-
return true;
}
@@ -644,7 +618,8 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
size = IP_SET_LIST_MIN_SIZE;
set->variant = &set_variant;
- set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem));
+ set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem),
+ __alignof__(struct set_elem));
if (!init_list_set(net, set, size))
return -ENOMEM;
if (tb[IPSET_ATTR_TIMEOUT]) {
@@ -678,7 +653,8 @@ static struct ip_set_type list_set_type __read_mostly = {
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
- [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
@@ -695,6 +671,7 @@ list_set_init(void)
static void __exit
list_set_fini(void)
{
+ rcu_barrier();
ip_set_type_unregister(&list_set_type);
}
diff --git a/kernel/net/netfilter/ipset/pfxlen.c b/kernel/net/netfilter/ipset/pfxlen.c
index 04d15fdc9..1c8a42c10 100644
--- a/kernel/net/netfilter/ipset/pfxlen.c
+++ b/kernel/net/netfilter/ipset/pfxlen.c
@@ -1,9 +1,7 @@
#include <linux/export.h>
#include <linux/netfilter/ipset/pfxlen.h>
-/*
- * Prefixlen maps for fast conversions, by Jan Engelhardt.
- */
+/* Prefixlen maps for fast conversions, by Jan Engelhardt. */
#define E(a, b, c, d) \
{.ip6 = { \
@@ -11,8 +9,7 @@
htonl(c), htonl(d), \
} }
-/*
- * This table works for both IPv4 and IPv6;
+/* This table works for both IPv4 and IPv6;
* just use prefixlen_netmask_map[prefixlength].ip.
*/
const union nf_inet_addr ip_set_netmask_map[] = {
@@ -149,13 +146,12 @@ const union nf_inet_addr ip_set_netmask_map[] = {
EXPORT_SYMBOL_GPL(ip_set_netmask_map);
#undef E
-#define E(a, b, c, d) \
- {.ip6 = { (__force __be32) a, (__force __be32) b, \
- (__force __be32) c, (__force __be32) d, \
+#define E(a, b, c, d) \
+ {.ip6 = { (__force __be32)a, (__force __be32)b, \
+ (__force __be32)c, (__force __be32)d, \
} }
-/*
- * This table works for both IPv4 and IPv6;
+/* This table works for both IPv4 and IPv6;
* just use prefixlen_hostmask_map[prefixlength].ip.
*/
const union nf_inet_addr ip_set_hostmask_map[] = {
diff --git a/kernel/net/netfilter/ipvs/Kconfig b/kernel/net/netfilter/ipvs/Kconfig
index 3b6929dec..b32fb0dbe 100644
--- a/kernel/net/netfilter/ipvs/Kconfig
+++ b/kernel/net/netfilter/ipvs/Kconfig
@@ -162,6 +162,17 @@ config IP_VS_FO
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_OVF
+ tristate "weighted overflow scheduling"
+ ---help---
+ The weighted overflow scheduling algorithm directs network
+ connections to the server with the highest weight that is
+ currently available and overflows to the next when active
+ connections exceed the node's weight.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
config IP_VS_LBLC
tristate "locality-based least-connection scheduling"
---help---
diff --git a/kernel/net/netfilter/ipvs/Makefile b/kernel/net/netfilter/ipvs/Makefile
index 38b2723b2..67f3f4389 100644
--- a/kernel/net/netfilter/ipvs/Makefile
+++ b/kernel/net/netfilter/ipvs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o
+obj-$(CONFIG_IP_VS_OVF) += ip_vs_ovf.o
obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
diff --git a/kernel/net/netfilter/ipvs/ip_vs_app.c b/kernel/net/netfilter/ipvs/ip_vs_app.c
index dfd7b65b3..0328f7250 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_app.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_app.c
@@ -75,7 +75,7 @@ static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
* Allocate/initialize app incarnation and register it in proto apps.
*/
static int
-ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
+ip_vs_app_inc_new(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto,
__u16 port)
{
struct ip_vs_protocol *pp;
@@ -107,7 +107,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
}
}
- ret = pp->register_app(net, inc);
+ ret = pp->register_app(ipvs, inc);
if (ret)
goto out;
@@ -127,7 +127,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
* Release app incarnation
*/
static void
-ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
+ip_vs_app_inc_release(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
struct ip_vs_protocol *pp;
@@ -135,7 +135,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
return;
if (pp->unregister_app)
- pp->unregister_app(net, inc);
+ pp->unregister_app(ipvs, inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
pp->name, inc->name, ntohs(inc->port));
@@ -175,14 +175,14 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc)
* Register an application incarnation in protocol applications
*/
int
-register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
+register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto,
__u16 port)
{
int result;
mutex_lock(&__ip_vs_app_mutex);
- result = ip_vs_app_inc_new(net, app, proto, port);
+ result = ip_vs_app_inc_new(ipvs, app, proto, port);
mutex_unlock(&__ip_vs_app_mutex);
@@ -191,15 +191,11 @@ register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
/* Register application for netns */
-struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app)
+struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_app *a;
int err = 0;
- if (!ipvs)
- return ERR_PTR(-ENOENT);
-
mutex_lock(&__ip_vs_app_mutex);
list_for_each_entry(a, &ipvs->app_list, a_list) {
@@ -230,21 +226,17 @@ out_unlock:
* We are sure there are no app incarnations attached to services
* Caller should use synchronize_rcu() or rcu_barrier()
*/
-void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
+void unregister_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_app *a, *anxt, *inc, *nxt;
- if (!ipvs)
- return;
-
mutex_lock(&__ip_vs_app_mutex);
list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) {
if (app && strcmp(app->name, a->name))
continue;
list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) {
- ip_vs_app_inc_release(net, inc);
+ ip_vs_app_inc_release(ipvs, inc);
}
list_del(&a->a_list);
@@ -611,17 +603,19 @@ static const struct file_operations ip_vs_app_fops = {
};
#endif
-int __net_init ip_vs_app_net_init(struct net *net)
+int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net *net = ipvs->net;
INIT_LIST_HEAD(&ipvs->app_list);
proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
return 0;
}
-void __net_exit ip_vs_app_net_cleanup(struct net *net)
+void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
{
- unregister_ip_vs_app(net, NULL /* all */);
+ struct net *net = ipvs->net;
+
+ unregister_ip_vs_app(ipvs, NULL /* all */);
remove_proc_entry("ip_vs_app", net->proc_net);
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_conn.c b/kernel/net/netfilter/ipvs/ip_vs_conn.c
index b0f7b626b..85ca189bd 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_conn.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_conn.c
@@ -108,7 +108,7 @@ static inline void ct_write_unlock_bh(unsigned int key)
/*
* Returns hash value for IPVS connection entry
*/
-static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto,
+static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
const union nf_inet_addr *addr,
__be16 port)
{
@@ -116,11 +116,11 @@ static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int pro
if (af == AF_INET6)
return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
(__force u32)port, proto, ip_vs_conn_rnd) ^
- ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+ ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask;
#endif
return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
ip_vs_conn_rnd) ^
- ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+ ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask;
}
static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
@@ -141,14 +141,14 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
port = p->vport;
}
- return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
+ return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port);
}
static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+ ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
&cp->caddr, cp->cport, NULL, 0, &p);
if (cp->pe) {
@@ -279,7 +279,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
p->protocol == cp->protocol &&
- ip_vs_conn_net_eq(cp, p->net)) {
+ cp->ipvs == p->ipvs) {
if (!__ip_vs_conn_get(cp))
continue;
/* HIT */
@@ -314,33 +314,34 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
}
static int
-ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs,
+ int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
- int inverse, struct ip_vs_conn_param *p)
+ struct ip_vs_conn_param *p)
{
__be16 _ports[2], *pptr;
- struct net *net = skb_net(skb);
pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return 1;
- if (likely(!inverse))
- ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+ if (likely(!ip_vs_iph_inverse(iph)))
+ ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr,
pptr[0], &iph->daddr, pptr[1], p);
else
- ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+ ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr,
pptr[1], &iph->saddr, pptr[0], p);
return 0;
}
struct ip_vs_conn *
-ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph, int inverse)
+ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af,
+ const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
{
struct ip_vs_conn_param p;
- if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
+ if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
return NULL;
return ip_vs_conn_in_get(&p);
@@ -359,7 +360,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (unlikely(p->pe_data && p->pe->ct_match)) {
- if (!ip_vs_conn_net_eq(cp, p->net))
+ if (cp->ipvs != p->ipvs)
continue;
if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
if (__ip_vs_conn_get(cp))
@@ -377,7 +378,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
p->vport == cp->vport && p->cport == cp->cport &&
cp->flags & IP_VS_CONN_F_TEMPLATE &&
p->protocol == cp->protocol &&
- ip_vs_conn_net_eq(cp, p->net)) {
+ cp->ipvs == p->ipvs) {
if (__ip_vs_conn_get(cp))
goto out;
}
@@ -418,7 +419,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
p->protocol == cp->protocol &&
- ip_vs_conn_net_eq(cp, p->net)) {
+ cp->ipvs == p->ipvs) {
if (!__ip_vs_conn_get(cp))
continue;
/* HIT */
@@ -439,12 +440,13 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
}
struct ip_vs_conn *
-ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph, int inverse)
+ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
+ const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
{
struct ip_vs_conn_param p;
- if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
+ if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
return NULL;
return ip_vs_conn_out_get(&p);
@@ -638,7 +640,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
* so we can make the assumption that the svc_af is the same as the
* dest_af
*/
- dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr,
+ dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr,
cp->dport, &cp->vaddr, cp->vport,
cp->protocol, cp->fwmark, cp->flags);
if (dest) {
@@ -668,7 +670,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
#endif
ip_vs_bind_xmit(cp);
- pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol);
+ pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol);
if (pd && atomic_read(&pd->appcnt))
ip_vs_bind_app(cp, pd->pp);
}
@@ -746,7 +748,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs,
int ip_vs_check_template(struct ip_vs_conn *ct)
{
struct ip_vs_dest *dest = ct->dest;
- struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
+ struct netns_ipvs *ipvs = ct->ipvs;
/*
* Checking the dest server status.
@@ -800,8 +802,7 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head)
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
- struct net *net = ip_vs_conn_net(cp);
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct netns_ipvs *ipvs = cp->ipvs;
/*
* do I control anybody?
@@ -847,7 +848,7 @@ static void ip_vs_conn_expire(unsigned long data)
cp->timeout = 60*HZ;
if (ipvs->sync_state & IP_VS_STATE_MASTER)
- ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
+ ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));
ip_vs_conn_put(cp);
}
@@ -875,8 +876,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
- struct netns_ipvs *ipvs = net_ipvs(p->net);
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+ struct netns_ipvs *ipvs = p->ipvs;
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs,
p->protocol);
cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
@@ -887,7 +888,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
INIT_HLIST_NODE(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
- ip_vs_conn_net_set(cp, p->net);
+ cp->ipvs = ipvs;
cp->af = p->af;
cp->daf = dest_af;
cp->protocol = p->protocol;
@@ -1061,7 +1062,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
size_t len = 0;
char dbuf[IP_VS_ADDRSTRLEN];
- if (!ip_vs_conn_net_eq(cp, net))
+ if (!net_eq(cp->ipvs->net, net))
return 0;
if (cp->pe_data) {
pe_data[0] = ' ';
@@ -1146,7 +1147,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
const struct ip_vs_conn *cp = v;
struct net *net = seq_file_net(seq);
- if (!ip_vs_conn_net_eq(cp, net))
+ if (!net_eq(cp->ipvs->net, net))
return 0;
#ifdef CONFIG_IP_VS_IPV6
@@ -1240,7 +1241,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
}
/* Called from keventd and must protect itself from softirqs */
-void ip_vs_random_dropentry(struct net *net)
+void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
int idx;
struct ip_vs_conn *cp, *cp_c;
@@ -1256,7 +1257,7 @@ void ip_vs_random_dropentry(struct net *net)
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
- if (!ip_vs_conn_net_eq(cp, net))
+ if (cp->ipvs != ipvs)
continue;
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
@@ -1308,18 +1309,17 @@ void ip_vs_random_dropentry(struct net *net)
/*
* Flush all the connection entries in the ip_vs_conn_tab
*/
-static void ip_vs_conn_flush(struct net *net)
+static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
{
int idx;
struct ip_vs_conn *cp, *cp_c;
- struct netns_ipvs *ipvs = net_ipvs(net);
flush_again:
rcu_read_lock();
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
- if (!ip_vs_conn_net_eq(cp, net))
+ if (cp->ipvs != ipvs)
continue;
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
@@ -1345,23 +1345,22 @@ flush_again:
/*
* per netns init and exit
*/
-int __net_init ip_vs_conn_net_init(struct net *net)
+int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
atomic_set(&ipvs->conn_count, 0);
- proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops);
- proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops);
+ proc_create("ip_vs_conn", 0, ipvs->net->proc_net, &ip_vs_conn_fops);
+ proc_create("ip_vs_conn_sync", 0, ipvs->net->proc_net,
+ &ip_vs_conn_sync_fops);
return 0;
}
-void __net_exit ip_vs_conn_net_cleanup(struct net *net)
+void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
/* flush all the connection entries first */
- ip_vs_conn_flush(net);
- remove_proc_entry("ip_vs_conn", net->proc_net);
- remove_proc_entry("ip_vs_conn_sync", net->proc_net);
+ ip_vs_conn_flush(ipvs);
+ remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
+ remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
}
int __init ip_vs_conn_init(void)
diff --git a/kernel/net/netfilter/ipvs/ip_vs_core.c b/kernel/net/netfilter/ipvs/ip_vs_core.c
index 5d2b806a8..f57b4dcdb 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_core.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_core.c
@@ -112,7 +112,7 @@ static inline void
ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
- struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+ struct netns_ipvs *ipvs = cp->ipvs;
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
struct ip_vs_cpu_stats *s;
@@ -146,7 +146,7 @@ static inline void
ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
- struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+ struct netns_ipvs *ipvs = cp->ipvs;
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
struct ip_vs_cpu_stats *s;
@@ -179,7 +179,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
static inline void
ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
{
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct netns_ipvs *ipvs = svc->ipvs;
struct ip_vs_cpu_stats *s;
s = this_cpu_ptr(cp->dest->stats.cpustats);
@@ -215,7 +215,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
const union nf_inet_addr *vaddr, __be16 vport,
struct ip_vs_conn_param *p)
{
- ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr,
vport, p);
p->pe = rcu_dereference(svc->pe);
if (p->pe && p->pe->fill_param)
@@ -245,20 +245,30 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
union nf_inet_addr snet; /* source network of the client,
after masking */
+ const union nf_inet_addr *src_addr, *dst_addr;
+
+ if (likely(!ip_vs_iph_inverse(iph))) {
+ src_addr = &iph->saddr;
+ dst_addr = &iph->daddr;
+ } else {
+ src_addr = &iph->daddr;
+ dst_addr = &iph->saddr;
+ }
+
/* Mask saddr with the netmask to adjust template granularity */
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
- ipv6_addr_prefix(&snet.in6, &iph->saddr.in6,
+ ipv6_addr_prefix(&snet.in6, &src_addr->in6,
(__force __u32) svc->netmask);
else
#endif
- snet.ip = iph->saddr.ip & svc->netmask;
+ snet.ip = src_addr->ip & svc->netmask;
IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
"mnet %s\n",
- IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port),
- IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port),
+ IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port),
IP_VS_DBG_ADDR(svc->af, &snet));
/*
@@ -276,7 +286,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
*/
{
int protocol = iph->protocol;
- const union nf_inet_addr *vaddr = &iph->daddr;
+ const union nf_inet_addr *vaddr = dst_addr;
__be16 vport = 0;
if (dst_port == svc->port) {
@@ -319,7 +329,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* return *ignored=0 i.e. ICMP and NF_DROP
*/
sched = rcu_dereference(svc->scheduler);
- dest = sched->schedule(svc, skb, iph);
+ if (sched) {
+ /* read svc->sched_data after svc->scheduler */
+ smp_rmb();
+ dest = sched->schedule(svc, skb, iph);
+ } else {
+ dest = NULL;
+ }
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
kfree(param.pe_data);
@@ -360,8 +376,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/*
* Create a new connection according to the template
*/
- ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr,
- src_port, &iph->daddr, dst_port, &param);
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr,
+ src_port, dst_addr, dst_port, &param);
cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest,
skb->mark);
@@ -412,7 +428,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_conn *cp = NULL;
struct ip_vs_scheduler *sched;
struct ip_vs_dest *dest;
- __be16 _ports[2], *pptr;
+ __be16 _ports[2], *pptr, cport, vport;
+ const void *caddr, *vaddr;
unsigned int flags;
*ignored = 1;
@@ -423,14 +440,26 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
if (pptr == NULL)
return NULL;
+ if (likely(!ip_vs_iph_inverse(iph))) {
+ cport = pptr[0];
+ caddr = &iph->saddr;
+ vport = pptr[1];
+ vaddr = &iph->daddr;
+ } else {
+ cport = pptr[1];
+ caddr = &iph->daddr;
+ vport = pptr[0];
+ vaddr = &iph->saddr;
+ }
+
/*
* FTPDATA needs this check when using local real server.
* Never schedule Active FTPDATA connections from real server.
* For LVS-NAT they must be already created. For other methods
* with persistence the connection is created on SYN+ACK.
*/
- if (pptr[0] == FTPDATA) {
- IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ if (cport == FTPDATA) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
"Not scheduling FTPDATA");
return NULL;
}
@@ -438,19 +467,25 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
/*
* Do not schedule replies from local real server.
*/
- if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- (cp = pp->conn_in_get(svc->af, skb, iph, 1))) {
- IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
- "Not scheduling reply for existing connection");
- __ip_vs_conn_put(cp);
- return NULL;
+ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) {
+ iph->hdr_flags ^= IP_VS_HDR_INVERSE;
+ cp = pp->conn_in_get(svc->ipvs, svc->af, skb, iph);
+ iph->hdr_flags ^= IP_VS_HDR_INVERSE;
+
+ if (cp) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
+ "Not scheduling reply for existing"
+ " connection");
+ __ip_vs_conn_put(cp);
+ return NULL;
+ }
}
/*
* Persistent service
*/
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
- return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored,
+ return ip_vs_sched_persist(svc, skb, cport, vport, ignored,
iph);
*ignored = 0;
@@ -458,7 +493,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
/*
* Non-persistent service
*/
- if (!svc->fwmark && pptr[1] != svc->port) {
+ if (!svc->fwmark && vport != svc->port) {
if (!svc->port)
pr_err("Schedule: port zero only supported "
"in persistent services, "
@@ -467,7 +502,13 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
}
sched = rcu_dereference(svc->scheduler);
- dest = sched->schedule(svc, skb, iph);
+ if (sched) {
+ /* read svc->sched_data after svc->scheduler */
+ smp_rmb();
+ dest = sched->schedule(svc, skb, iph);
+ } else {
+ dest = NULL;
+ }
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
@@ -483,11 +524,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
- &iph->saddr, pptr[0], &iph->daddr,
- pptr[1], &p);
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
+ caddr, cport, vaddr, vport, &p);
cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
- dest->port ? dest->port : pptr[1],
+ dest->port ? dest->port : vport,
flags, dest, skb->mark);
if (!cp) {
*ignored = -1;
@@ -507,6 +547,15 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
return cp;
}
+static inline int ip_vs_addr_is_unicast(struct net *net, int af,
+ union nf_inet_addr *addr)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST;
+#endif
+ return (inet_addr_type(net, addr->ip) == RTN_UNICAST);
+}
/*
* Pass or drop the packet.
@@ -516,33 +565,21 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
{
- __be16 _ports[2], *pptr;
-#ifdef CONFIG_SYSCTL
- struct net *net;
- struct netns_ipvs *ipvs;
- int unicast;
-#endif
+ __be16 _ports[2], *pptr, dport;
+ struct netns_ipvs *ipvs = svc->ipvs;
+ struct net *net = ipvs->net;
pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
- if (pptr == NULL) {
+ if (!pptr)
return NF_DROP;
- }
-
-#ifdef CONFIG_SYSCTL
- net = skb_net(skb);
-
-#ifdef CONFIG_IP_VS_IPV6
- if (svc->af == AF_INET6)
- unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST;
- else
-#endif
- unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST);
+ dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0];
/* if it is fwmark-based service, the cache_bypass sysctl is up
and the destination is a non-local unicast, then create
a cache_bypass connection entry */
- ipvs = net_ipvs(net);
- if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
+ if (sysctl_cache_bypass(ipvs) && svc->fwmark &&
+ !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) &&
+ ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) {
int ret;
struct ip_vs_conn *cp;
unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
@@ -554,7 +591,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
&iph->saddr, pptr[0],
&iph->daddr, pptr[1], &p);
cp = ip_vs_conn_new(&p, svc->af, &daddr, 0,
@@ -578,7 +615,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ip_vs_conn_put(cp);
return ret;
}
-#endif
/*
* When the virtual ftp service is presented, packets destined
@@ -586,9 +622,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* listed in the ipvs table), pass the packets, because it is
* not ipvs job to decide to drop the packets.
*/
- if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
+ if (svc->port == FTPPORT && dport != FTPPORT)
return NF_ACCEPT;
+ if (unlikely(ip_vs_iph_icmp(iph)))
+ return NF_DROP;
+
/*
* Notify the client that the destination is unreachable, and
* release the socket buffer.
@@ -598,11 +637,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
*/
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6) {
- if (!skb->dev) {
- struct net *net_ = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net_->loopback_dev;
- }
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
} else
#endif
@@ -613,15 +649,13 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
#ifdef CONFIG_SYSCTL
-static int sysctl_snat_reroute(struct sk_buff *skb)
+static int sysctl_snat_reroute(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
return ipvs->sysctl_snat_reroute;
}
-static int sysctl_nat_icmp_send(struct net *net)
+static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
return ipvs->sysctl_nat_icmp_send;
}
@@ -632,8 +666,8 @@ static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
#else
-static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; }
-static int sysctl_nat_icmp_send(struct net *net) { return 0; }
+static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
+static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }
static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
#endif
@@ -652,12 +686,13 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
return IP_DEFRAG_VS_OUT;
}
-static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs,
+ struct sk_buff *skb, u_int32_t user)
{
int err;
local_bh_disable();
- err = ip_defrag(skb, user);
+ err = ip_defrag(ipvs->net, skb, user);
local_bh_enable();
if (!err)
ip_send_check(ip_hdr(skb));
@@ -665,10 +700,10 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
return err;
}
-static int ip_vs_route_me_harder(int af, struct sk_buff *skb,
- unsigned int hooknum)
+static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
+ struct sk_buff *skb, unsigned int hooknum)
{
- if (!sysctl_snat_reroute(skb))
+ if (!sysctl_snat_reroute(ipvs))
return 0;
/* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */
if (NF_INET_LOCAL_IN == hooknum)
@@ -678,12 +713,12 @@ static int ip_vs_route_me_harder(int af, struct sk_buff *skb,
struct dst_entry *dst = skb_dst(skb);
if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
- ip6_route_me_harder(skb) != 0)
+ ip6_route_me_harder(ipvs->net, skb) != 0)
return 1;
} else
#endif
if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
- ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
return 1;
return 0;
@@ -836,7 +871,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
#endif
ip_vs_nat_icmp(skb, pp, cp, 1);
- if (ip_vs_route_me_harder(af, skb, hooknum))
+ if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
goto out;
/* do the statistics and put it back */
@@ -860,8 +895,8 @@ out:
* Find any that might be relevant, check against existing connections.
* Currently handles error types - unreachable, quench, ttl exceeded.
*/
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
- unsigned int hooknum)
+static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ int *related, unsigned int hooknum)
{
struct iphdr *iph;
struct icmphdr _icmph, *ic;
@@ -876,7 +911,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
/* reassemble IP fragments */
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+ if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -922,10 +957,10 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
"Checking outgoing ICMP for");
- ip_vs_fill_ip4hdr(cih, &ciph);
- ciph.len += offset;
+ ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph);
+
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET, skb, &ciph, 1);
+ cp = pp->conn_out_get(ipvs, AF_INET, skb, &ciph);
if (!cp)
return NF_ACCEPT;
@@ -935,16 +970,16 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
}
#ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
- unsigned int hooknum, struct ip_vs_iphdr *ipvsh)
+static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ int *related, unsigned int hooknum,
+ struct ip_vs_iphdr *ipvsh)
{
struct icmp6hdr _icmph, *ic;
- struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */
struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
union nf_inet_addr snet;
- unsigned int writable;
+ unsigned int offset;
*related = 1;
ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
@@ -972,31 +1007,23 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
ic->icmp6_type, ntohs(icmpv6_id(ic)),
&ipvsh->saddr, &ipvsh->daddr);
- /* Now find the contained IP header */
- ciph.len = ipvsh->len + sizeof(_icmph);
- ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
- if (ip6h == NULL)
+ if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph),
+ true, &ciph))
return NF_ACCEPT; /* The packet looks wrong, ignore */
- ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */
- ciph.daddr.in6 = ip6h->daddr;
- /* skip possible IPv6 exthdrs of contained IPv6 packet */
- ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
- if (ciph.protocol < 0)
- return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
pp = ip_vs_proto_get(ciph.protocol);
if (!pp)
return NF_ACCEPT;
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1);
+ cp = pp->conn_out_get(ipvs, AF_INET6, skb, &ciph);
if (!cp)
return NF_ACCEPT;
snet.in6 = ciph.saddr.in6;
- writable = ciph.len;
+ offset = ciph.len;
return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
- pp, writable, sizeof(struct ipv6hdr),
+ pp, offset, sizeof(struct ipv6hdr),
hooknum);
}
#endif
@@ -1081,7 +1108,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
{
struct ip_vs_protocol *pp = pd->pp;
- IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
+ IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
if (!skb_make_writable(skb, iph->len))
goto drop;
@@ -1115,10 +1142,10 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* if it came from this machine itself. So re-compute
* the routing information.
*/
- if (ip_vs_route_me_harder(af, skb, hooknum))
+ if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
goto drop;
- IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
+ IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
@@ -1143,13 +1170,13 @@ drop:
* Check if outgoing packet belongs to the established ip_vs_conn.
*/
static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
+ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
- struct net *net = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
+ struct sock *sk;
EnterFunction(11);
@@ -1157,29 +1184,27 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (skb->ipvs_property)
return NF_ACCEPT;
+ sk = skb_to_full_sk(skb);
/* Bad... Do not break raw sockets */
- if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(skb->sk);
- if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
return NF_ACCEPT;
}
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
- net = skb_net(skb);
- if (!net_ipvs(net)->enable)
+ if (!ipvs->enable)
return NF_ACCEPT;
- ip_vs_fill_iph_skb(af, skb, &iph);
+ ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
- int verdict = ip_vs_out_icmp_v6(skb, &related,
+ int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related,
hooknum, &iph);
if (related)
@@ -1189,13 +1214,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related;
- int verdict = ip_vs_out_icmp(skb, &related, hooknum);
+ int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);
if (related)
return verdict;
}
- pd = ip_vs_proto_data_get(net, iph.protocol);
+ pd = ip_vs_proto_data_get(ipvs, iph.protocol);
if (unlikely(!pd))
return NF_ACCEPT;
pp = pd->pp;
@@ -1205,21 +1230,21 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (af == AF_INET)
#endif
if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
- if (ip_vs_gather_frags(skb,
+ if (ip_vs_gather_frags(ipvs, skb,
ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
- ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
+ ip_vs_fill_iph_skb(AF_INET, skb, false, &iph);
}
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(af, skb, &iph, 0);
+ cp = pp->conn_out_get(ipvs, af, skb, &iph);
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
- if (sysctl_nat_icmp_send(net) &&
+ if (sysctl_nat_icmp_send(ipvs) &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
pp->protocol == IPPROTO_SCTP)) {
@@ -1229,7 +1254,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
sizeof(_ports), _ports, &iph);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
- if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+ if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,
pptr[0])) {
/*
* Notify the real server: there is no
@@ -1246,7 +1271,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (!skb->dev)
- skb->dev = net->loopback_dev;
+ skb->dev = ipvs->net->loopback_dev;
icmpv6_send(skb,
ICMPV6_DEST_UNREACH,
ICMPV6_PORT_UNREACH,
@@ -1260,7 +1285,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
}
}
}
- IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
"ip_vs_out: packet continues traversal as normal");
return NF_ACCEPT;
}
@@ -1271,10 +1296,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_reply4(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_out(ops->hooknum, skb, AF_INET);
+ return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
}
/*
@@ -1282,10 +1307,10 @@ ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_local_reply4(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_out(ops->hooknum, skb, AF_INET);
+ return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1296,10 +1321,10 @@ ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_reply6(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_out(ops->hooknum, skb, AF_INET6);
+ return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
}
/*
@@ -1307,14 +1332,51 @@ ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_local_reply6(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_out(ops->hooknum, skb, AF_INET6);
+ return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
}
#endif
+static unsigned int
+ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_protocol *pp = pd->pp;
+
+ if (!iph->fragoffs) {
+ /* No (second) fragments need to enter here, as nf_defrag_ipv6
+ * replayed fragment zero will already have created the cp
+ */
+
+ /* Schedule and create new connection entry into cpp */
+ if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph))
+ return 0;
+ }
+
+ if (unlikely(!*cpp)) {
+ /* sorry, all this trouble for a no-hit :) */
+ IP_VS_DBG_PKT(12, af, pp, skb, iph->off,
+ "ip_vs_in: packet continues traversal as normal");
+ if (iph->fragoffs) {
+ /* Fragment that couldn't be mapped to a conn entry
+ * is missing module nf_defrag_ipv6
+ */
+ IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+ IP_VS_DBG_PKT(7, af, pp, skb, iph->off,
+ "unhandled fragment");
+ }
+ *verdict = NF_ACCEPT;
+ return 0;
+ }
+
+ return 1;
+}
+
/*
* Handle ICMP messages in the outside-to-inside direction (incoming).
* Find any that might be relevant, check against existing connections,
@@ -1322,9 +1384,9 @@ ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Currently handles error types - unreachable, quench, ttl exceeded.
*/
static int
-ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
+ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
+ unsigned int hooknum)
{
- struct net *net = NULL;
struct iphdr *iph;
struct icmphdr _icmph, *ic;
struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
@@ -1333,13 +1395,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
unsigned int offset, offset2, ihl, verdict;
- bool ipip;
+ bool ipip, new_cp = false;
*related = 1;
/* reassemble IP fragments */
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+ if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -1373,8 +1435,6 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- net = skb_net(skb);
-
/* Special case for errors for IPIP packets */
ipip = false;
if (cih->protocol == IPPROTO_IPIP) {
@@ -1390,7 +1450,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
ipip = true;
}
- pd = ip_vs_proto_data_get(net, cih->protocol);
+ pd = ip_vs_proto_data_get(ipvs, cih->protocol);
if (!pd)
return NF_ACCEPT;
pp = pd->pp;
@@ -1404,15 +1464,24 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
"Checking incoming ICMP for");
offset2 = offset;
- ip_vs_fill_ip4hdr(cih, &ciph);
- ciph.len += offset;
+ ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph);
offset = ciph.len;
+
/* The embedded headers contain source and dest in reverse order.
* For IPIP this is error for request, not for reply.
*/
- cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1);
- if (!cp)
- return NF_ACCEPT;
+ cp = pp->conn_in_get(ipvs, AF_INET, skb, &ciph);
+
+ if (!cp) {
+ int v;
+
+ if (!sysctl_schedule_icmp(ipvs))
+ return NF_ACCEPT;
+
+ if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
+ return v;
+ new_cp = true;
+ }
verdict = NF_DROP;
@@ -1443,7 +1512,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
skb_reset_network_header(skb);
IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
- ipv4_update_pmtu(skb, dev_net(skb->dev),
+ ipv4_update_pmtu(skb, ipvs->net,
mtu, 0, 0, 0, 0);
/* Client uses PMTUD? */
if (!(frag_off & htons(IP_DF)))
@@ -1489,23 +1558,26 @@ ignore_ipip:
verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);
out:
- __ip_vs_conn_put(cp);
+ if (likely(!new_cp))
+ __ip_vs_conn_put(cp);
+ else
+ ip_vs_conn_put(cp);
return verdict;
}
#ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
- unsigned int hooknum, struct ip_vs_iphdr *iph)
+static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ int *related, unsigned int hooknum,
+ struct ip_vs_iphdr *iph)
{
- struct net *net = NULL;
- struct ipv6hdr _ip6h, *ip6h;
struct icmp6hdr _icmph, *ic;
struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
- unsigned int offs_ciph, writable, verdict;
+ unsigned int offset, verdict;
+ bool new_cp = false;
*related = 1;
@@ -1534,21 +1606,11 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
ic->icmp6_type, ntohs(icmpv6_id(ic)),
&iph->saddr, &iph->daddr);
- /* Now find the contained IP header */
- ciph.len = iph->len + sizeof(_icmph);
- offs_ciph = ciph.len; /* Save ip header offset */
- ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
- if (ip6h == NULL)
- return NF_ACCEPT; /* The packet looks wrong, ignore */
- ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */
- ciph.daddr.in6 = ip6h->daddr;
- /* skip possible IPv6 exthdrs of contained IPv6 packet */
- ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
- if (ciph.protocol < 0)
- return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
-
- net = skb_net(skb);
- pd = ip_vs_proto_data_get(net, ciph.protocol);
+ offset = iph->len + sizeof(_icmph);
+ if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph))
+ return NF_ACCEPT;
+
+ pd = ip_vs_proto_data_get(ipvs, ciph.protocol);
if (!pd)
return NF_ACCEPT;
pp = pd->pp;
@@ -1557,36 +1619,49 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
if (ciph.fragoffs)
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
"Checking incoming ICMPv6 for");
/* The embedded headers contain source and dest in reverse order
* if not from localhost
*/
- cp = pp->conn_in_get(AF_INET6, skb, &ciph,
- (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);
+ cp = pp->conn_in_get(ipvs, AF_INET6, skb, &ciph);
+
+ if (!cp) {
+ int v;
+
+ if (!sysctl_schedule_icmp(ipvs))
+ return NF_ACCEPT;
+
+ if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph))
+ return v;
+
+ new_cp = true;
+ }
- if (!cp)
- return NF_ACCEPT;
/* VS/TUN, VS/DR and LOCALNODE just let it go */
if ((hooknum == NF_INET_LOCAL_OUT) &&
(IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
- __ip_vs_conn_put(cp);
- return NF_ACCEPT;
+ verdict = NF_ACCEPT;
+ goto out;
}
/* do the statistics and put it back */
ip_vs_in_stats(cp, skb);
/* Need to mangle contained IPv6 header in ICMPv6 packet */
- writable = ciph.len;
+ offset = ciph.len;
if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
IPPROTO_SCTP == ciph.protocol)
- writable += 2 * sizeof(__u16); /* Also mangle ports */
+ offset += 2 * sizeof(__u16); /* Also mangle ports */
- verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph);
+ verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph);
- __ip_vs_conn_put(cp);
+out:
+ if (likely(!new_cp))
+ __ip_vs_conn_put(cp);
+ else
+ ip_vs_conn_put(cp);
return verdict;
}
@@ -1598,16 +1673,15 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
* and send it on its way...
*/
static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
+ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
- struct net *net;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, pkts;
- struct netns_ipvs *ipvs;
int conn_reuse_mode;
+ struct sock *sk;
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -1621,7 +1695,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
if (unlikely((skb->pkt_type != PACKET_HOST &&
hooknum != NF_INET_LOCAL_OUT) ||
!skb_dst(skb))) {
- ip_vs_fill_iph_skb(af, skb, &iph);
+ ip_vs_fill_iph_skb(af, skb, false, &iph);
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
" ignored in hook %u\n",
skb->pkt_type, iph.protocol,
@@ -1629,20 +1703,17 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
return NF_ACCEPT;
}
/* ipvs enabled in this netns ? */
- net = skb_net(skb);
- ipvs = net_ipvs(net);
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- ip_vs_fill_iph_skb(af, skb, &iph);
+ ip_vs_fill_iph_skb(af, skb, false, &iph);
/* Bad... Do not break raw sockets */
- if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ sk = skb_to_full_sk(skb);
+ if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(skb->sk);
- if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
return NF_ACCEPT;
}
@@ -1650,8 +1721,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
- int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
- &iph);
+ int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related,
+ hooknum, &iph);
if (related)
return verdict;
@@ -1660,21 +1731,30 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related;
- int verdict = ip_vs_in_icmp(skb, &related, hooknum);
+ int verdict = ip_vs_in_icmp(ipvs, skb, &related,
+ hooknum);
if (related)
return verdict;
}
/* Protocol supported? */
- pd = ip_vs_proto_data_get(net, iph.protocol);
- if (unlikely(!pd))
+ pd = ip_vs_proto_data_get(ipvs, iph.protocol);
+ if (unlikely(!pd)) {
+ /* The only way we'll see this packet again is if it's
+ * encapsulated, so mark it with ipvs_property=1 so we
+ * skip it if we're ignoring tunneled packets
+ */
+ if (sysctl_ignore_tunneled(ipvs))
+ skb->ipvs_property = 1;
+
return NF_ACCEPT;
+ }
pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
*/
- cp = pp->conn_in_get(af, skb, &iph, 0);
+ cp = pp->conn_in_get(ipvs, af, skb, &iph);
conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
if (conn_reuse_mode && !iph.fragoffs &&
@@ -1688,32 +1768,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
cp = NULL;
}
- if (unlikely(!cp) && !iph.fragoffs) {
- /* No (second) fragments need to enter here, as nf_defrag_ipv6
- * replayed fragment zero will already have created the cp
- */
+ if (unlikely(!cp)) {
int v;
- /* Schedule and create new connection entry into &cp */
- if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
+ if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
return v;
}
- if (unlikely(!cp)) {
- /* sorry, all this trouble for a no-hit :) */
- IP_VS_DBG_PKT(12, af, pp, skb, 0,
- "ip_vs_in: packet continues traversal as normal");
- if (iph.fragoffs) {
- /* Fragment that couldn't be mapped to a conn entry
- * is missing module nf_defrag_ipv6
- */
- IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
- IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
- }
- return NF_ACCEPT;
- }
+ IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
- IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
@@ -1753,7 +1816,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
pkts = atomic_add_return(1, &cp->in_pkts);
if (ipvs->sync_state & IP_VS_STATE_MASTER)
- ip_vs_sync_conn(net, cp, pkts);
+ ip_vs_sync_conn(ipvs, cp, pkts);
ip_vs_conn_put(cp);
return ret;
@@ -1764,10 +1827,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
* Schedule and forward packets from remote clients
*/
static unsigned int
-ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_remote_request4(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_in(ops->hooknum, skb, AF_INET);
+ return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
}
/*
@@ -1775,10 +1838,10 @@ ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Schedule and forward packets from local clients
*/
static unsigned int
-ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_local_request4(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_in(ops->hooknum, skb, AF_INET);
+ return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1788,10 +1851,10 @@ ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Schedule and forward packets from remote clients
*/
static unsigned int
-ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_remote_request6(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_in(ops->hooknum, skb, AF_INET6);
+ return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
}
/*
@@ -1799,10 +1862,10 @@ ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
* Schedule and forward packets from local clients
*/
static unsigned int
-ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_local_request6(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return ip_vs_in(ops->hooknum, skb, AF_INET6);
+ return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
}
#endif
@@ -1818,46 +1881,40 @@ ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
* and send them to ip_vs_in_icmp.
*/
static unsigned int
-ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_forward_icmp(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
int r;
- struct net *net;
- struct netns_ipvs *ipvs;
+ struct netns_ipvs *ipvs = net_ipvs(state->net);
if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
return NF_ACCEPT;
/* ipvs enabled in this netns ? */
- net = skb_net(skb);
- ipvs = net_ipvs(net);
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- return ip_vs_in_icmp(skb, &r, ops->hooknum);
+ return ip_vs_in_icmp(ipvs, skb, &r, state->hook);
}
#ifdef CONFIG_IP_VS_IPV6
static unsigned int
-ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
int r;
- struct net *net;
- struct netns_ipvs *ipvs;
+ struct netns_ipvs *ipvs = net_ipvs(state->net);
struct ip_vs_iphdr iphdr;
- ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
+ ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr);
if (iphdr.protocol != IPPROTO_ICMPV6)
return NF_ACCEPT;
/* ipvs enabled in this netns ? */
- net = skb_net(skb);
- ipvs = net_ipvs(net);
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);
+ return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr);
}
#endif
@@ -1866,7 +1923,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 2,
@@ -1876,7 +1932,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
* applied to IPVS. */
{
.hook = ip_vs_remote_request4,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 1,
@@ -1884,7 +1939,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply4,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 1,
@@ -1892,7 +1946,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request4,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 2,
@@ -1901,7 +1954,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 99,
@@ -1909,7 +1961,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
@@ -1918,7 +1969,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC - 2,
@@ -1928,7 +1978,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
* applied to IPVS. */
{
.hook = ip_vs_remote_request6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC - 1,
@@ -1936,7 +1985,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 1,
@@ -1944,7 +1992,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 2,
@@ -1953,7 +2000,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp_v6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 99,
@@ -1961,7 +2007,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply6,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 100,
@@ -1987,22 +2032,22 @@ static int __net_init __ip_vs_init(struct net *net)
atomic_inc(&ipvs_netns_cnt);
net->ipvs = ipvs;
- if (ip_vs_estimator_net_init(net) < 0)
+ if (ip_vs_estimator_net_init(ipvs) < 0)
goto estimator_fail;
- if (ip_vs_control_net_init(net) < 0)
+ if (ip_vs_control_net_init(ipvs) < 0)
goto control_fail;
- if (ip_vs_protocol_net_init(net) < 0)
+ if (ip_vs_protocol_net_init(ipvs) < 0)
goto protocol_fail;
- if (ip_vs_app_net_init(net) < 0)
+ if (ip_vs_app_net_init(ipvs) < 0)
goto app_fail;
- if (ip_vs_conn_net_init(net) < 0)
+ if (ip_vs_conn_net_init(ipvs) < 0)
goto conn_fail;
- if (ip_vs_sync_net_init(net) < 0)
+ if (ip_vs_sync_net_init(ipvs) < 0)
goto sync_fail;
printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
@@ -2013,15 +2058,15 @@ static int __net_init __ip_vs_init(struct net *net)
*/
sync_fail:
- ip_vs_conn_net_cleanup(net);
+ ip_vs_conn_net_cleanup(ipvs);
conn_fail:
- ip_vs_app_net_cleanup(net);
+ ip_vs_app_net_cleanup(ipvs);
app_fail:
- ip_vs_protocol_net_cleanup(net);
+ ip_vs_protocol_net_cleanup(ipvs);
protocol_fail:
- ip_vs_control_net_cleanup(net);
+ ip_vs_control_net_cleanup(ipvs);
control_fail:
- ip_vs_estimator_net_cleanup(net);
+ ip_vs_estimator_net_cleanup(ipvs);
estimator_fail:
net->ipvs = NULL;
return -ENOMEM;
@@ -2029,22 +2074,25 @@ estimator_fail:
static void __net_exit __ip_vs_cleanup(struct net *net)
{
- ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */
- ip_vs_conn_net_cleanup(net);
- ip_vs_app_net_cleanup(net);
- ip_vs_protocol_net_cleanup(net);
- ip_vs_control_net_cleanup(net);
- ip_vs_estimator_net_cleanup(net);
- IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */
+ ip_vs_conn_net_cleanup(ipvs);
+ ip_vs_app_net_cleanup(ipvs);
+ ip_vs_protocol_net_cleanup(ipvs);
+ ip_vs_control_net_cleanup(ipvs);
+ ip_vs_estimator_net_cleanup(ipvs);
+ IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen);
net->ipvs = NULL;
}
static void __net_exit __ip_vs_dev_cleanup(struct net *net)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
EnterFunction(2);
- net_ipvs(net)->enable = 0; /* Disable packet reception */
+ ipvs->enable = 0; /* Disable packet reception */
smp_wmb();
- ip_vs_sync_net_cleanup(net);
+ ip_vs_sync_net_cleanup(ipvs);
LeaveFunction(2);
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_ctl.c b/kernel/net/netfilter/ipvs/ip_vs_ctl.c
index 285eae3a1..e7c1b052c 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_ctl.c
@@ -228,7 +228,7 @@ static void defense_work_handler(struct work_struct *work)
update_defense_level(ipvs);
if (atomic_read(&ipvs->dropentry))
- ip_vs_random_dropentry(ipvs->net);
+ ip_vs_random_dropentry(ipvs);
schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
}
#endif
@@ -263,7 +263,7 @@ static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
* Returns hash value for virtual service
*/
static inline unsigned int
-ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
+ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
const union nf_inet_addr *addr, __be16 port)
{
register unsigned int porth = ntohs(port);
@@ -276,7 +276,7 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
addr->ip6[2]^addr->ip6[3];
#endif
ahash = ntohl(addr_fold);
- ahash ^= ((size_t) net >> 8);
+ ahash ^= ((size_t) ipvs >> 8);
return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
IP_VS_SVC_TAB_MASK;
@@ -285,9 +285,9 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
/*
* Returns hash value of fwmark for virtual service lookup
*/
-static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
+static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark)
{
- return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
+ return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
}
/*
@@ -309,14 +309,14 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
/*
* Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
*/
- hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
+ hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol,
&svc->addr, svc->port);
hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
* Hash it by fwmark in svc_fwm_table
*/
- hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
+ hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark);
hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
@@ -357,21 +357,21 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
* Get service by {netns, proto,addr,port} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
const union nf_inet_addr *vaddr, __be16 vport)
{
unsigned int hash;
struct ip_vs_service *svc;
/* Check for "full" addressed entries */
- hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
+ hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport);
hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
if ((svc->af == af)
&& ip_vs_addr_equal(af, &svc->addr, vaddr)
&& (svc->port == vport)
&& (svc->protocol == protocol)
- && net_eq(svc->net, net)) {
+ && (svc->ipvs == ipvs)) {
/* HIT */
return svc;
}
@@ -385,17 +385,17 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol,
* Get service by {fwmark} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
{
unsigned int hash;
struct ip_vs_service *svc;
/* Check for fwmark addressed entries */
- hash = ip_vs_svc_fwm_hashkey(net, fwmark);
+ hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark);
hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
if (svc->fwmark == fwmark && svc->af == af
- && net_eq(svc->net, net)) {
+ && (svc->ipvs == ipvs)) {
/* HIT */
return svc;
}
@@ -406,17 +406,16 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
/* Find service, called under RCU lock */
struct ip_vs_service *
-ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
+ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
const union nf_inet_addr *vaddr, __be16 vport)
{
struct ip_vs_service *svc;
- struct netns_ipvs *ipvs = net_ipvs(net);
/*
* Check the table hashed by fwmark first
*/
if (fwmark) {
- svc = __ip_vs_svc_fwm_find(net, af, fwmark);
+ svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
if (svc)
goto out;
}
@@ -425,7 +424,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
- svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
+ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
@@ -435,7 +434,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
- svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
+ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
}
if (svc == NULL
@@ -443,7 +442,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
/*
* Check if the catch-all port (port zero) exists
*/
- svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
+ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
}
out:
@@ -543,10 +542,9 @@ static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
}
/* Check if real service by <proto,addr,port> is present */
-bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
const union nf_inet_addr *daddr, __be16 dport)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
unsigned int hash;
struct ip_vs_dest *dest;
@@ -601,7 +599,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
* on the backup.
* Called under RCU lock, no refcnt is returned.
*/
-struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af,
+struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
const union nf_inet_addr *daddr,
__be16 dport,
const union nf_inet_addr *vaddr,
@@ -612,7 +610,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af,
struct ip_vs_service *svc;
__be16 port = dport;
- svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport);
+ svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
if (!svc)
return NULL;
if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
@@ -660,7 +658,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
const union nf_inet_addr *daddr, __be16 dport)
{
struct ip_vs_dest *dest;
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct netns_ipvs *ipvs = svc->ipvs;
/*
* Find the destination in trash
@@ -715,10 +713,9 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
* are expired, and the refcnt of each destination in the trash must
* be 0, so we simply release them here.
*/
-static void ip_vs_trash_cleanup(struct net *net)
+static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
{
struct ip_vs_dest *dest, *nxt;
- struct netns_ipvs *ipvs = net_ipvs(net);
del_timer_sync(&ipvs->dest_trash_timer);
/* No need to use dest_trash_lock */
@@ -788,7 +785,7 @@ static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct netns_ipvs *ipvs = svc->ipvs;
struct ip_vs_service *old_svc;
struct ip_vs_scheduler *sched;
int conn_flags;
@@ -842,15 +839,16 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
__ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
- sched = rcu_dereference_protected(svc->scheduler, 1);
if (add) {
- ip_vs_start_estimator(svc->net, &dest->stats);
+ ip_vs_start_estimator(svc->ipvs, &dest->stats);
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
- if (sched->add_dest)
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched && sched->add_dest)
sched->add_dest(svc, dest);
} else {
- if (sched->upd_dest)
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched && sched->upd_dest)
sched->upd_dest(svc, dest);
}
}
@@ -873,12 +871,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
- !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
+ !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
return -EINVAL;
} else
#endif
{
- atype = inet_addr_type(svc->net, udest->addr.ip);
+ atype = inet_addr_type(svc->ipvs->net, udest->addr.ip);
if (atype != RTN_LOCAL && atype != RTN_UNICAST)
return -EINVAL;
}
@@ -1035,12 +1033,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Delete a destination (must be already unlinked from the service)
*/
-static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
+static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
bool cleanup)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
- ip_vs_stop_estimator(net, &dest->stats);
+ ip_vs_stop_estimator(ipvs, &dest->stats);
/*
* Remove it from the d-linked list with the real services.
@@ -1078,13 +1074,13 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
svc->num_dests--;
if (dest->af != svc->af)
- net_ipvs(svc->net)->mixed_address_family_dests--;
+ svc->ipvs->mixed_address_family_dests--;
if (svcupd) {
struct ip_vs_scheduler *sched;
sched = rcu_dereference_protected(svc->scheduler, 1);
- if (sched->del_dest)
+ if (sched && sched->del_dest)
sched->del_dest(svc, dest);
}
}
@@ -1119,7 +1115,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Delete the destination
*/
- __ip_vs_del_dest(svc->net, dest, false);
+ __ip_vs_del_dest(svc->ipvs, dest, false);
LeaveFunction(2);
@@ -1128,8 +1124,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
static void ip_vs_dest_trash_expire(unsigned long data)
{
- struct net *net = (struct net *) data;
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct netns_ipvs *ipvs = (struct netns_ipvs *)data;
struct ip_vs_dest *dest, *next;
unsigned long now = jiffies;
@@ -1162,24 +1157,26 @@ static void ip_vs_dest_trash_expire(unsigned long data)
* Add a service into the service hash table
*/
static int
-ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
+ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
int ret = 0, i;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
- struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
/* Lookup the scheduler by 'u->sched_name' */
- sched = ip_vs_scheduler_get(u->sched_name);
- if (sched == NULL) {
- pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
- ret = -ENOENT;
- goto out_err;
+ if (strcmp(u->sched_name, "none")) {
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (!sched) {
+ pr_info("Scheduler module ip_vs_%s not found\n",
+ u->sched_name);
+ ret = -ENOENT;
+ goto out_err;
+ }
}
if (u->pe_name && *u->pe_name) {
@@ -1233,17 +1230,19 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
- svc->net = net;
+ svc->ipvs = ipvs;
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
- ret = ip_vs_bind_scheduler(svc, sched);
- if (ret)
- goto out_err;
- sched = NULL;
+ if (sched) {
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret)
+ goto out_err;
+ sched = NULL;
+ }
/* Bind the ct retriever */
RCU_INIT_POINTER(svc->pe, pe);
@@ -1255,7 +1254,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
- ip_vs_start_estimator(net, &svc->stats);
+ ip_vs_start_estimator(ipvs, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
@@ -1291,17 +1290,20 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
{
- struct ip_vs_scheduler *sched, *old_sched;
+ struct ip_vs_scheduler *sched = NULL, *old_sched;
struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0;
/*
* Lookup the scheduler, by 'u->sched_name'
*/
- sched = ip_vs_scheduler_get(u->sched_name);
- if (sched == NULL) {
- pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
- return -ENOENT;
+ if (strcmp(u->sched_name, "none")) {
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (!sched) {
+ pr_info("Scheduler module ip_vs_%s not found\n",
+ u->sched_name);
+ return -ENOENT;
+ }
}
old_sched = sched;
@@ -1329,14 +1331,20 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
old_sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched != old_sched) {
+ if (old_sched) {
+ ip_vs_unbind_scheduler(svc, old_sched);
+ RCU_INIT_POINTER(svc->scheduler, NULL);
+ /* Wait all svc->sched_data users */
+ synchronize_rcu();
+ }
/* Bind the new scheduler */
- ret = ip_vs_bind_scheduler(svc, sched);
- if (ret) {
- old_sched = sched;
- goto out;
+ if (sched) {
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret) {
+ ip_vs_scheduler_put(sched);
+ goto out;
+ }
}
- /* Unbind the old scheduler on success */
- ip_vs_unbind_scheduler(svc, old_sched);
}
/*
@@ -1366,7 +1374,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
struct ip_vs_pe *old_pe;
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct netns_ipvs *ipvs = svc->ipvs;
pr_info("%s: enter\n", __func__);
@@ -1374,7 +1382,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
if (svc->af == AF_INET)
ipvs->num_services--;
- ip_vs_stop_estimator(svc->net, &svc->stats);
+ ip_vs_stop_estimator(svc->ipvs, &svc->stats);
/* Unbind scheduler */
old_sched = rcu_dereference_protected(svc->scheduler, 1);
@@ -1390,7 +1398,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
*/
list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
__ip_vs_unlink_dest(svc, dest, 0);
- __ip_vs_del_dest(svc->net, dest, cleanup);
+ __ip_vs_del_dest(svc->ipvs, dest, cleanup);
}
/*
@@ -1441,7 +1449,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Flush all the virtual services
*/
-static int ip_vs_flush(struct net *net, bool cleanup)
+static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
{
int idx;
struct ip_vs_service *svc;
@@ -1453,7 +1461,7 @@ static int ip_vs_flush(struct net *net, bool cleanup)
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
s_list) {
- if (net_eq(svc->net, net))
+ if (svc->ipvs == ipvs)
ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1464,7 +1472,7 @@ static int ip_vs_flush(struct net *net, bool cleanup)
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
f_list) {
- if (net_eq(svc->net, net))
+ if (svc->ipvs == ipvs)
ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1476,12 +1484,12 @@ static int ip_vs_flush(struct net *net, bool cleanup)
* Delete service by {netns} in the service table.
* Called by __ip_vs_cleanup()
*/
-void ip_vs_service_net_cleanup(struct net *net)
+void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs)
{
EnterFunction(2);
/* Check for "full" addressed entries */
/*
 * ip_vs_flush() is called with its 'cleanup' argument set to true and
 * with __ip_vs_mutex held for the whole call; it unlinks every service
 * in both hash tables whose svc->ipvs matches the given ipvs.
 */
mutex_lock(&__ip_vs_mutex);
- ip_vs_flush(net, true);
+ ip_vs_flush(ipvs, true);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
}
@@ -1525,7 +1533,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
mutex_lock(&__ip_vs_mutex);
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- if (net_eq(svc->net, net)) {
+ if (svc->ipvs == ipvs) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
ip_vs_forget_dev(dest, dev);
@@ -1534,7 +1542,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
}
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- if (net_eq(svc->net, net)) {
+ if (svc->ipvs == ipvs) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
ip_vs_forget_dev(dest, dev);
@@ -1568,26 +1576,26 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
return 0;
}
/*
 * Zero the statistics of every virtual service that belongs to @ipvs.
 *
 * Both global hash tables are walked (ip_vs_svc_table via s_list, then
 * ip_vs_svc_fwm_table via f_list); entries owned by another netns_ipvs
 * are skipped. Finally the per-netns totals in ipvs->tot_stats are
 * zeroed as well.
 *
 * Always returns 0; the return value of ip_vs_zero_service() is ignored.
 */
-static int ip_vs_zero_all(struct net *net)
+static int ip_vs_zero_all(struct netns_ipvs *ipvs)
{
int idx;
struct ip_vs_service *svc;
/* Services hashed by <protocol, addr, port> */
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- if (net_eq(svc->net, net))
+ if (svc->ipvs == ipvs)
ip_vs_zero_service(svc);
}
}
/* Services hashed by firewall mark */
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- if (net_eq(svc->net, net))
+ if (svc->ipvs == ipvs)
ip_vs_zero_service(svc);
}
}
- ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
+ ip_vs_zero_stats(&ipvs->tot_stats);
return 0;
}
@@ -1600,7 +1608,7 @@ static int
proc_do_defense_mode(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- struct net *net = current->nsproxy->net_ns;
+ struct netns_ipvs *ipvs = table->extra2;
int *valp = table->data;
int val = *valp;
int rc;
@@ -1611,7 +1619,7 @@ proc_do_defense_mode(struct ctl_table *table, int write,
/* Restore the correct value */
*valp = val;
} else {
- update_defense_level(net_ipvs(net));
+ update_defense_level(ipvs);
}
}
return rc;
@@ -1829,6 +1837,18 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "schedule_icmp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ignore_tunneled",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
@@ -1874,6 +1894,7 @@ static inline const char *ip_vs_fwd_name(unsigned int flags)
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_iter *iter = seq->private;
int idx;
struct ip_vs_service *svc;
@@ -1881,7 +1902,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* look in hash by protocol */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
- if (net_eq(svc->net, net) && pos-- == 0) {
+ if ((svc->ipvs == ipvs) && pos-- == 0) {
iter->table = ip_vs_svc_table;
iter->bucket = idx;
return svc;
@@ -1893,7 +1914,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
f_list) {
- if (net_eq(svc->net, net) && pos-- == 0) {
+ if ((svc->ipvs == ipvs) && pos-- == 0) {
iter->table = ip_vs_svc_fwm_table;
iter->bucket = idx;
return svc;
@@ -1982,6 +2003,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
const struct ip_vs_iter *iter = seq->private;
const struct ip_vs_dest *dest;
struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
+ char *sched_name = sched ? sched->name : "none";
if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
@@ -1990,18 +2012,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
ip_vs_proto_name(svc->protocol),
&svc->addr.in6,
ntohs(svc->port),
- sched->name);
+ sched_name);
else
#endif
seq_printf(seq, "%s %08X:%04X %s %s ",
ip_vs_proto_name(svc->protocol),
ntohl(svc->addr.ip),
ntohs(svc->port),
- sched->name,
+ sched_name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
} else {
seq_printf(seq, "FWM %08X %s %s",
- svc->fwmark, sched->name,
+ svc->fwmark, sched_name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
}
@@ -2180,7 +2202,7 @@ static const struct file_operations ip_vs_stats_percpu_fops = {
/*
* Set timeout values for tcp tcpfin udp in the timeout_table.
*/
-static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
+static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
{
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
struct ip_vs_proto_data *pd;
@@ -2193,13 +2215,13 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
#ifdef CONFIG_IP_VS_PROTO_TCP
if (u->tcp_timeout) {
- pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
= u->tcp_timeout * HZ;
}
if (u->tcp_fin_timeout) {
- pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
= u->tcp_fin_timeout * HZ;
}
@@ -2207,7 +2229,7 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
#ifdef CONFIG_IP_VS_PROTO_UDP
if (u->udp_timeout) {
- pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
pd->timeout_table[IP_VS_UDP_S_NORMAL]
= u->udp_timeout * HZ;
}
@@ -2319,24 +2341,34 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
cmd == IP_VS_SO_SET_STOPDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- mutex_lock(&ipvs->sync_mutex);
- if (cmd == IP_VS_SO_SET_STARTDAEMON)
- ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
- dm->syncid);
- else
- ret = stop_sync_thread(net, dm->state);
- mutex_unlock(&ipvs->sync_mutex);
+ if (cmd == IP_VS_SO_SET_STARTDAEMON) {
+ struct ipvs_sync_daemon_cfg cfg;
+
+ memset(&cfg, 0, sizeof(cfg));
+ strlcpy(cfg.mcast_ifn, dm->mcast_ifn,
+ sizeof(cfg.mcast_ifn));
+ cfg.syncid = dm->syncid;
+ rtnl_lock();
+ mutex_lock(&ipvs->sync_mutex);
+ ret = start_sync_thread(ipvs, &cfg, dm->state);
+ mutex_unlock(&ipvs->sync_mutex);
+ rtnl_unlock();
+ } else {
+ mutex_lock(&ipvs->sync_mutex);
+ ret = stop_sync_thread(ipvs, dm->state);
+ mutex_unlock(&ipvs->sync_mutex);
+ }
goto out_dec;
}
mutex_lock(&__ip_vs_mutex);
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
- ret = ip_vs_flush(net, false);
+ ret = ip_vs_flush(ipvs, false);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
- ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
+ ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
}
@@ -2351,7 +2383,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_ZERO) {
/* if no service address is set, zero counters in all */
if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
- ret = ip_vs_zero_all(net);
+ ret = ip_vs_zero_all(ipvs);
goto out_unlock;
}
}
@@ -2369,10 +2401,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Lookup the exact service by <protocol, addr, port> or fwmark */
rcu_read_lock();
if (usvc.fwmark == 0)
- svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
+ svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
&usvc.addr, usvc.port);
else
- svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+ svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
rcu_read_unlock();
if (cmd != IP_VS_SO_SET_ADD
@@ -2386,7 +2418,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (svc != NULL)
ret = -EEXIST;
else
- ret = ip_vs_add_service(net, &usvc, &svc);
+ ret = ip_vs_add_service(ipvs, &usvc, &svc);
break;
case IP_VS_SO_SET_EDIT:
ret = ip_vs_edit_service(svc, &usvc);
@@ -2427,13 +2459,15 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
struct ip_vs_scheduler *sched;
struct ip_vs_kstats kstats;
+ char *sched_name;
sched = rcu_dereference_protected(src->scheduler, 1);
+ sched_name = sched ? sched->name : "none";
dst->protocol = src->protocol;
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
+ strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2443,7 +2477,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
}
static inline int
-__ip_vs_get_service_entries(struct net *net,
+__ip_vs_get_service_entries(struct netns_ipvs *ipvs,
const struct ip_vs_get_services *get,
struct ip_vs_get_services __user *uptr)
{
@@ -2455,7 +2489,7 @@ __ip_vs_get_service_entries(struct net *net,
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET || !net_eq(svc->net, net))
+ if (svc->af != AF_INET || (svc->ipvs != ipvs))
continue;
if (count >= get->num_services)
@@ -2474,7 +2508,7 @@ __ip_vs_get_service_entries(struct net *net,
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET || !net_eq(svc->net, net))
+ if (svc->af != AF_INET || (svc->ipvs != ipvs))
continue;
if (count >= get->num_services)
@@ -2494,7 +2528,7 @@ out:
}
static inline int
-__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
+__ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get,
struct ip_vs_get_dests __user *uptr)
{
struct ip_vs_service *svc;
@@ -2503,9 +2537,9 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
rcu_read_lock();
if (get->fwmark)
- svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
+ svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark);
else
- svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
+ svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr,
get->port);
rcu_read_unlock();
@@ -2550,7 +2584,7 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
}
static inline void
-__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
+__ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
{
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
struct ip_vs_proto_data *pd;
@@ -2559,12 +2593,12 @@ __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
memset(u, 0, sizeof (*u));
#ifdef CONFIG_IP_VS_PROTO_TCP
- pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
- pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
u->udp_timeout =
pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
@@ -2627,15 +2661,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
mutex_lock(&ipvs->sync_mutex);
if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+ strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
sizeof(d[0].mcast_ifn));
- d[0].syncid = ipvs->master_syncid;
+ d[0].syncid = ipvs->mcfg.syncid;
}
if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+ strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
sizeof(d[1].mcast_ifn));
- d[1].syncid = ipvs->backup_syncid;
+ d[1].syncid = ipvs->bcfg.syncid;
}
if (copy_to_user(user, &d, sizeof(d)) != 0)
ret = -EFAULT;
@@ -2683,7 +2717,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_service_entries(net, get, user);
+ ret = __ip_vs_get_service_entries(ipvs, get, user);
}
break;
@@ -2697,9 +2731,9 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
addr.ip = entry->addr;
rcu_read_lock();
if (entry->fwmark)
- svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
+ svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark);
else
- svc = __ip_vs_service_find(net, AF_INET,
+ svc = __ip_vs_service_find(ipvs, AF_INET,
entry->protocol, &addr,
entry->port);
rcu_read_unlock();
@@ -2725,7 +2759,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_dest_entries(net, get, user);
+ ret = __ip_vs_get_dest_entries(ipvs, get, user);
}
break;
@@ -2733,7 +2767,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(net, &t);
+ __ip_vs_get_timeouts(ipvs, &t);
if (copy_to_user(user, &t, sizeof(t)) != 0)
ret = -EFAULT;
}
@@ -2790,6 +2824,11 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
[IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
.len = IP_VS_IFNAME_MAXLEN },
[IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
+ [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 },
+ [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 },
+ [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) },
+ [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 },
+ [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 },
};
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
@@ -2892,6 +2931,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
struct ip_vs_flags flags = { .flags = svc->flags,
.mask = ~0 };
struct ip_vs_kstats kstats;
+ char *sched_name;
nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
if (!nl_service)
@@ -2910,8 +2950,9 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
}
sched = rcu_dereference_protected(svc->scheduler, 1);
+ sched_name = sched ? sched->name : "none";
pe = rcu_dereference_protected(svc->pe, 1);
- if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
+ if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
(pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
@@ -2961,12 +3002,13 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
int idx = 0, i;
int start = cb->args[0];
struct ip_vs_service *svc;
- struct net *net = skb_sknet(skb);
+ struct net *net = sock_net(skb->sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
mutex_lock(&__ip_vs_mutex);
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
- if (++idx <= start || !net_eq(svc->net, net))
+ if (++idx <= start || (svc->ipvs != ipvs))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2977,7 +3019,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
- if (++idx <= start || !net_eq(svc->net, net))
+ if (++idx <= start || (svc->ipvs != ipvs))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2993,7 +3035,7 @@ nla_put_failure:
return skb->len;
}
-static int ip_vs_genl_parse_service(struct net *net,
+static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
struct ip_vs_service_user_kern *usvc,
struct nlattr *nla, int full_entry,
struct ip_vs_service **ret_svc)
@@ -3038,9 +3080,9 @@ static int ip_vs_genl_parse_service(struct net *net,
rcu_read_lock();
if (usvc->fwmark)
- svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
+ svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
else
- svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
+ svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
&usvc->addr, usvc->port);
rcu_read_unlock();
*ret_svc = svc;
@@ -3078,14 +3120,14 @@ static int ip_vs_genl_parse_service(struct net *net,
return 0;
}
/*
 * Look up the service described by the nested netlink attribute @nla.
 *
 * Thin wrapper around ip_vs_genl_parse_service() called with
 * full_entry == 0; the parsed user structure is discarded and only the
 * looked-up service pointer is kept.
 *
 * Returns ERR_PTR(ret) when parsing fails, otherwise the service pointer
 * produced by the parser. Visible callers also test the result for NULL,
 * so a successful parse may still yield no service.
 */
-static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
struct ip_vs_service *svc;
int ret;
- ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
+ ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, 0, &svc);
return ret ? ERR_PTR(ret) : svc;
}
@@ -3160,7 +3202,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
- struct net *net = skb_sknet(skb);
+ struct net *net = sock_net(skb->sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
mutex_lock(&__ip_vs_mutex);
@@ -3170,7 +3213,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
goto out_err;
- svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
+ svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc) || svc == NULL)
goto out_err;
@@ -3246,7 +3289,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
}
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
- const char *mcast_ifn, __u32 syncid)
+ struct ipvs_sync_daemon_cfg *c)
{
struct nlattr *nl_daemon;
@@ -3255,9 +3298,23 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
return -EMSGSIZE;
if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
- nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
- nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
+ nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
+ nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) ||
+ nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
+ nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
+ nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
goto nla_put_failure;
+#ifdef CONFIG_IP_VS_IPV6
+ if (c->mcast_af == AF_INET6) {
+ if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
+ &c->mcast_group.in6))
+ goto nla_put_failure;
+ } else
+#endif
+ if (c->mcast_af == AF_INET &&
+ nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
+ c->mcast_group.ip))
+ goto nla_put_failure;
nla_nest_end(skb, nl_daemon);
return 0;
@@ -3268,7 +3325,7 @@ nla_put_failure:
}
static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
- const char *mcast_ifn, __u32 syncid,
+ struct ipvs_sync_daemon_cfg *c,
struct netlink_callback *cb)
{
void *hdr;
@@ -3278,7 +3335,7 @@ static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
if (!hdr)
return -EMSGSIZE;
- if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
+ if (ip_vs_genl_fill_daemon(skb, state, c))
goto nla_put_failure;
genlmsg_end(skb, hdr);
@@ -3292,14 +3349,13 @@ nla_put_failure:
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct net *net = skb_sknet(skb);
+ struct net *net = sock_net(skb->sk);
struct netns_ipvs *ipvs = net_ipvs(net);
mutex_lock(&ipvs->sync_mutex);
if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
- ipvs->master_mcast_ifn,
- ipvs->master_syncid, cb) < 0)
+ &ipvs->mcfg, cb) < 0)
goto nla_put_failure;
cb->args[0] = 1;
@@ -3307,8 +3363,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
- ipvs->backup_mcast_ifn,
- ipvs->backup_syncid, cb) < 0)
+ &ipvs->bcfg, cb) < 0)
goto nla_put_failure;
cb->args[1] = 1;
@@ -3320,39 +3375,90 @@ nla_put_failure:
return skb->len;
}
/*
 * Start a sync daemon from already-parsed IPVS_DAEMON_ATTR_* attributes.
 *
 * Builds a struct ipvs_sync_daemon_cfg from @attrs and hands it to
 * start_sync_thread() together with the requested state.
 *
 * Mandatory attributes: STATE, MCAST_IFN, SYNC_ID (else -EINVAL).
 * Optional attributes: SYNC_MAXLEN, MCAST_GROUP or MCAST_GROUP6,
 * MCAST_PORT, MCAST_TTL. Fields whose attribute is absent keep the zero
 * value from the initial memset().
 * NOTE(review): presumably start_sync_thread() substitutes defaults for
 * zeroed fields - confirm in ip_vs_sync.c.
 *
 * Returns -EINVAL for a non-multicast group address or when mixed
 * address family destinations exist, -EAFNOSUPPORT for an IPv6 group
 * without CONFIG_IP_VS_IPV6, otherwise the result of start_sync_thread().
 */
-static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
+static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
+ struct ipvs_sync_daemon_cfg c;
+ struct nlattr *a;
+ int ret;
+
+ memset(&c, 0, sizeof(c));
if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
/* Copy is bounded by the destination size, so a longer name is truncated */
+ strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+ sizeof(c.mcast_ifn));
+ c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
+
+ a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
+ if (a)
+ c.sync_maxlen = nla_get_u16(a);
+
/* An IPv4 group takes precedence; the IPv6 group is only read without it */
+ a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
+ if (a) {
+ c.mcast_af = AF_INET;
+ c.mcast_group.ip = nla_get_in_addr(a);
+ if (!ipv4_is_multicast(c.mcast_group.ip))
+ return -EINVAL;
+ } else {
+ a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
+ if (a) {
+#ifdef CONFIG_IP_VS_IPV6
+ int addr_type;
+
+ c.mcast_af = AF_INET6;
+ c.mcast_group.in6 = nla_get_in6_addr(a);
+ addr_type = ipv6_addr_type(&c.mcast_group.in6);
+ if (!(addr_type & IPV6_ADDR_MULTICAST))
+ return -EINVAL;
+#else
+ return -EAFNOSUPPORT;
+#endif
+ }
+ }
+
+ a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
+ if (a)
+ c.mcast_port = nla_get_u16(a);
+
+ a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
+ if (a)
+ c.mcast_ttl = nla_get_u8(a);
/* The synchronization protocol is incompatible with mixed family
* services
*/
- if (net_ipvs(net)->mixed_address_family_dests > 0)
+ if (ipvs->mixed_address_family_dests > 0)
return -EINVAL;
- return start_sync_thread(net,
- nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
- nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
- nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
/*
 * Lock order: rtnl_lock first, then ipvs->sync_mutex - the same order
 * used on the sockopt path in do_ip_vs_set_ctl().
 * NOTE(review): the reason start_sync_thread() needs RTNL is not
 * visible here - confirm against ip_vs_sync.c.
 */
+ rtnl_lock();
+ mutex_lock(&ipvs->sync_mutex);
+ ret = start_sync_thread(ipvs, &c,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+ mutex_unlock(&ipvs->sync_mutex);
+ rtnl_unlock();
+ return ret;
}
/*
 * Stop the sync daemon whose state is given in IPVS_DAEMON_ATTR_STATE.
 *
 * Returns -EINVAL when the STATE attribute is missing, otherwise the
 * result of stop_sync_thread(), which is called with ipvs->sync_mutex
 * held. Unlike the start path, rtnl_lock is not taken here.
 */
-static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
+static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
+ int ret;
+
if (!attrs[IPVS_DAEMON_ATTR_STATE])
return -EINVAL;
- return stop_sync_thread(net,
- nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+ mutex_lock(&ipvs->sync_mutex);
+ ret = stop_sync_thread(ipvs,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+ mutex_unlock(&ipvs->sync_mutex);
+ return ret;
}
-static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
+static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(net, &t);
+ __ip_vs_get_timeouts(ipvs, &t);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
@@ -3364,38 +3470,33 @@ static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
- return ip_vs_set_timeout(net, &t);
+ return ip_vs_set_timeout(ipvs, &t);
}
/*
 * Generic netlink handler for IPVS_CMD_NEW_DAEMON / IPVS_CMD_DEL_DAEMON.
 *
 * Parses the nested IPVS_CMD_ATTR_DAEMON attribute against
 * ip_vs_daemon_policy and dispatches to ip_vs_genl_new_daemon() or
 * ip_vs_genl_del_daemon().
 *
 * With the added lines, ret starts out as -EINVAL, so a missing or
 * unparsable daemon attribute - and any other command value - returns
 * -EINVAL. The removed lines show that ipvs->sync_mutex used to be taken
 * here; the added code takes it inside the two helpers instead.
 */
static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
{
- int ret = 0, cmd;
- struct net *net;
- struct netns_ipvs *ipvs;
+ int ret = -EINVAL, cmd;
+ struct net *net = sock_net(skb->sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
- net = skb_sknet(skb);
- ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
- mutex_lock(&ipvs->sync_mutex);
/* Any parse failure is reported as -EINVAL, not the parser's own code */
if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
info->attrs[IPVS_CMD_ATTR_DAEMON],
- ip_vs_daemon_policy)) {
- ret = -EINVAL;
+ ip_vs_daemon_policy))
goto out;
- }
if (cmd == IPVS_CMD_NEW_DAEMON)
- ret = ip_vs_genl_new_daemon(net, daemon_attrs);
+ ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
else
- ret = ip_vs_genl_del_daemon(net, daemon_attrs);
-out:
- mutex_unlock(&ipvs->sync_mutex);
+ ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
}
+
+out:
return ret;
}
@@ -3406,22 +3507,22 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
struct ip_vs_dest_user_kern udest;
int ret = 0, cmd;
int need_full_svc = 0, need_full_dest = 0;
- struct net *net;
+ struct net *net = sock_net(skb->sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
- net = skb_sknet(skb);
cmd = info->genlhdr->cmd;
mutex_lock(&__ip_vs_mutex);
if (cmd == IPVS_CMD_FLUSH) {
- ret = ip_vs_flush(net, false);
+ ret = ip_vs_flush(ipvs, false);
goto out;
} else if (cmd == IPVS_CMD_SET_CONFIG) {
- ret = ip_vs_genl_set_config(net, info->attrs);
+ ret = ip_vs_genl_set_config(ipvs, info->attrs);
goto out;
} else if (cmd == IPVS_CMD_ZERO &&
!info->attrs[IPVS_CMD_ATTR_SERVICE]) {
- ret = ip_vs_zero_all(net);
+ ret = ip_vs_zero_all(ipvs);
goto out;
}
@@ -3431,7 +3532,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
need_full_svc = 1;
- ret = ip_vs_genl_parse_service(net, &usvc,
+ ret = ip_vs_genl_parse_service(ipvs, &usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
need_full_svc, &svc);
if (ret)
@@ -3470,7 +3571,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
/* The synchronization protocol is incompatible
* with mixed family services
*/
- if (net_ipvs(net)->sync_state) {
+ if (ipvs->sync_state) {
ret = -EINVAL;
goto out;
}
@@ -3490,7 +3591,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
switch (cmd) {
case IPVS_CMD_NEW_SERVICE:
if (svc == NULL)
- ret = ip_vs_add_service(net, &usvc, &svc);
+ ret = ip_vs_add_service(ipvs, &usvc, &svc);
else
ret = -EEXIST;
break;
@@ -3528,9 +3629,9 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *msg;
void *reply;
int ret, cmd, reply_cmd;
- struct net *net;
+ struct net *net = sock_net(skb->sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
- net = skb_sknet(skb);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_GET_SERVICE)
@@ -3559,7 +3660,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_service *svc;
- svc = ip_vs_genl_find_service(net,
+ svc = ip_vs_genl_find_service(ipvs,
info->attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc)) {
ret = PTR_ERR(svc);
@@ -3580,7 +3681,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(net, &t);
+ __ip_vs_get_timeouts(ipvs, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
t.tcp_timeout) ||
@@ -3735,10 +3836,10 @@ static void ip_vs_genl_unregister(void)
* per netns init/exit func.
*/
#ifdef CONFIG_SYSCTL
-static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
+static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
+ struct net *net = ipvs->net;
int idx;
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ctl_table *tbl;
atomic_set(&ipvs->dropentry, 0);
@@ -3757,6 +3858,10 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
} else
tbl = vs_vars;
/* Initialize sysctl defaults */
+ for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
+ if (tbl[idx].proc_handler == proc_do_defense_mode)
+ tbl[idx].extra2 = ipvs;
+ }
idx = 0;
ipvs->sysctl_amemthresh = 1024;
tbl[idx++].data = &ipvs->sysctl_amemthresh;
@@ -3798,7 +3903,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
tbl[idx++].data = &ipvs->sysctl_backup_only;
ipvs->sysctl_conn_reuse_mode = 1;
tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
-
+ tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
+ tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
if (ipvs->sysctl_hdr == NULL) {
@@ -3806,7 +3912,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
kfree(tbl);
return -ENOMEM;
}
- ip_vs_start_estimator(net, &ipvs->tot_stats);
+ ip_vs_start_estimator(ipvs, &ipvs->tot_stats);
ipvs->sysctl_tbl = tbl;
/* Schedule defense work */
INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
@@ -3815,14 +3921,14 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
return 0;
}
-static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net *net = ipvs->net;
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
- ip_vs_stop_estimator(net, &ipvs->tot_stats);
+ ip_vs_stop_estimator(ipvs, &ipvs->tot_stats);
if (!net_eq(net, &init_net))
kfree(ipvs->sysctl_tbl);
@@ -3830,8 +3936,8 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
#else
-static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
-static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
+static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }
#endif
@@ -3839,10 +3945,10 @@ static struct notifier_block ip_vs_dst_notifier = {
.notifier_call = ip_vs_dst_event,
};
-int __net_init ip_vs_control_net_init(struct net *net)
+int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
+ struct net *net = ipvs->net;
int i, idx;
- struct netns_ipvs *ipvs = net_ipvs(net);
/* Initialize rs_table */
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
@@ -3851,7 +3957,7 @@ int __net_init ip_vs_control_net_init(struct net *net)
INIT_LIST_HEAD(&ipvs->dest_trash);
spin_lock_init(&ipvs->dest_trash_lock);
setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
- (unsigned long) net);
+ (unsigned long) ipvs);
atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0);
@@ -3873,7 +3979,7 @@ int __net_init ip_vs_control_net_init(struct net *net)
proc_create("ip_vs_stats_percpu", 0, net->proc_net,
&ip_vs_stats_percpu_fops);
- if (ip_vs_control_net_init_sysctl(net))
+ if (ip_vs_control_net_init_sysctl(ipvs))
goto err;
return 0;
@@ -3883,12 +3989,12 @@ err:
return -ENOMEM;
}
-void __net_exit ip_vs_control_net_cleanup(struct net *net)
+void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net *net = ipvs->net;
- ip_vs_trash_cleanup(net);
- ip_vs_control_net_cleanup_sysctl(net);
+ ip_vs_trash_cleanup(ipvs);
+ ip_vs_control_net_cleanup_sysctl(ipvs);
remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
remove_proc_entry("ip_vs_stats", net->proc_net);
remove_proc_entry("ip_vs", net->proc_net);
diff --git a/kernel/net/netfilter/ipvs/ip_vs_est.c b/kernel/net/netfilter/ipvs/ip_vs_est.c
index ef0eb0a8d..457c6c193 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_est.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_est.c
@@ -102,10 +102,8 @@ static void estimation_timer(unsigned long arg)
struct ip_vs_estimator *e;
struct ip_vs_stats *s;
u64 rate;
- struct net *net = (struct net *)arg;
- struct netns_ipvs *ipvs;
+ struct netns_ipvs *ipvs = (struct netns_ipvs *)arg;
- ipvs = net_ipvs(net);
spin_lock(&ipvs->est_lock);
list_for_each_entry(e, &ipvs->est_list, list) {
s = container_of(e, struct ip_vs_stats, est);
@@ -140,9 +138,8 @@ static void estimation_timer(unsigned long arg)
mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
}
-void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)
+void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
INIT_LIST_HEAD(&est->list);
@@ -152,9 +149,8 @@ void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)
spin_unlock_bh(&ipvs->est_lock);
}
-void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats)
+void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
spin_lock_bh(&ipvs->est_lock);
@@ -192,18 +188,16 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
dst->outbps = (e->outbps + 0xF) >> 5;
}
-int __net_init ip_vs_estimator_net_init(struct net *net)
+int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
INIT_LIST_HEAD(&ipvs->est_list);
spin_lock_init(&ipvs->est_lock);
- setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net);
+ setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)ipvs);
mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
return 0;
}
-void __net_exit ip_vs_estimator_net_cleanup(struct net *net)
+void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
- del_timer_sync(&net_ipvs(net)->est_timer);
+ del_timer_sync(&ipvs->est_timer);
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_ftp.c b/kernel/net/netfilter/ipvs/ip_vs_ftp.c
index 5d3daae98..d30c327bb 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_ftp.c
@@ -181,7 +181,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
int ret = 0;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
- struct net *net;
*diff = 0;
@@ -223,14 +222,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
*/
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ ip_vs_conn_fill_param(cp->ipvs, AF_INET,
iph->protocol, &from, port,
&cp->caddr, 0, &p);
n_cp = ip_vs_conn_out_get(&p);
}
if (!n_cp) {
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+ ip_vs_conn_fill_param(cp->ipvs,
AF_INET, IPPROTO_TCP, &cp->caddr,
0, &cp->vaddr, port, &p);
/* As above, this is ipv4 only */
@@ -289,9 +288,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* would be adjusted twice.
*/
- net = skb_net(skb);
cp->app_data = NULL;
- ip_vs_tcp_conn_listen(net, n_cp);
+ ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp);
return ret;
}
@@ -320,7 +318,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
union nf_inet_addr to;
__be16 port;
struct ip_vs_conn *n_cp;
- struct net *net;
/* no diff required for incoming packets */
*diff = 0;
@@ -392,7 +389,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ ip_vs_conn_fill_param(cp->ipvs, AF_INET,
iph->protocol, &to, port, &cp->vaddr,
htons(ntohs(cp->vport)-1), &p);
n_cp = ip_vs_conn_in_get(&p);
@@ -413,8 +410,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
/*
* Move tunnel to listen state
*/
- net = skb_net(skb);
- ip_vs_tcp_conn_listen(net, n_cp);
+ ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp);
return 1;
@@ -447,14 +443,14 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
if (!ipvs)
return -ENOENT;
- app = register_ip_vs_app(net, &ip_vs_ftp);
+ app = register_ip_vs_app(ipvs, &ip_vs_ftp);
if (IS_ERR(app))
return PTR_ERR(app);
for (i = 0; i < ports_count; i++) {
if (!ports[i])
continue;
- ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
+ ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
if (ret)
goto err_unreg;
pr_info("%s: loaded support on port[%d] = %d\n",
@@ -463,7 +459,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
return 0;
err_unreg:
- unregister_ip_vs_app(net, &ip_vs_ftp);
+ unregister_ip_vs_app(ipvs, &ip_vs_ftp);
return ret;
}
/*
@@ -471,7 +467,12 @@ err_unreg:
*/
static void __ip_vs_ftp_exit(struct net *net)
{
- unregister_ip_vs_app(net, &ip_vs_ftp);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return;
+
+ unregister_ip_vs_app(ipvs, &ip_vs_ftp);
}
static struct pernet_operations ip_vs_ftp_ops = {
diff --git a/kernel/net/netfilter/ipvs/ip_vs_lblc.c b/kernel/net/netfilter/ipvs/ip_vs_lblc.c
index 127f14046..cccf4d637 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_lblc.c
@@ -250,8 +250,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc)
static int sysctl_lblc_expiration(struct ip_vs_service *svc)
{
#ifdef CONFIG_SYSCTL
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
- return ipvs->sysctl_lblc_expiration;
+ return svc->ipvs->sysctl_lblc_expiration;
#else
return DEFAULT_EXPIRATION;
#endif
diff --git a/kernel/net/netfilter/ipvs/ip_vs_lblcr.c b/kernel/net/netfilter/ipvs/ip_vs_lblcr.c
index 2229d2d8b..796d70e47 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -415,8 +415,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
{
#ifdef CONFIG_SYSCTL
- struct netns_ipvs *ipvs = net_ipvs(svc->net);
- return ipvs->sysctl_lblcr_expiration;
+ return svc->ipvs->sysctl_lblcr_expiration;
#else
return DEFAULT_EXPIRATION;
#endif
diff --git a/kernel/net/netfilter/ipvs/ip_vs_nfct.c b/kernel/net/netfilter/ipvs/ip_vs_nfct.c
index 5882bbfd1..30434fb13 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_nfct.c
@@ -161,7 +161,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
/* RS->CLIENT */
orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
+ ip_vs_conn_fill_param(net_ipvs(net), exp->tuple.src.l3num, orig->dst.protonum,
&orig->src.u3, orig->src.u.tcp.port,
&orig->dst.u3, orig->dst.u.tcp.port, &p);
cp = ip_vs_conn_out_get(&p);
@@ -274,8 +274,7 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
" for conn " FMT_CONN "\n",
__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
- h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
- &tuple);
+ h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
/* Show what happens instead of calling nf_ct_kill() */
diff --git a/kernel/net/netfilter/ipvs/ip_vs_ovf.c b/kernel/net/netfilter/ipvs/ip_vs_ovf.c
new file mode 100644
index 000000000..f7d62c3b7
--- /dev/null
+++ b/kernel/net/netfilter/ipvs/ip_vs_ovf.c
@@ -0,0 +1,86 @@
+/*
+ * IPVS: Overflow-Connection Scheduling module
+ *
+ * Authors: Raducu Deaconu <rhadoo_io@yahoo.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Scheduler implements "overflow" loadbalancing according to number of active
+ * connections , will keep all conections to the node with the highest weight
+ * and overflow to the next node if the number of connections exceeds the node's
+ * weight.
+ * Note that this scheduler might not be suitable for UDP because it only uses
+ * active connections
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/* OVF Connection scheduling */
+static struct ip_vs_dest *
+ip_vs_ovf_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *h = NULL;
+ int hw = 0, w;
+
+ IP_VS_DBG(6, "ip_vs_ovf_schedule(): Scheduling...\n");
+ /* select the node with highest weight, go to next in line if active
+ * connections exceed weight
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ w = atomic_read(&dest->weight);
+ if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
+ atomic_read(&dest->activeconns) > w ||
+ w == 0)
+ continue;
+ if (!h || w > hw) {
+ h = dest;
+ hw = w;
+ }
+ }
+
+ if (h) {
+ IP_VS_DBG_BUF(6, "OVF: server %s:%u active %d w %d\n",
+ IP_VS_DBG_ADDR(h->af, &h->addr),
+ ntohs(h->port),
+ atomic_read(&h->activeconns),
+ atomic_read(&h->weight));
+ return h;
+ }
+
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+}
+
+static struct ip_vs_scheduler ip_vs_ovf_scheduler = {
+ .name = "ovf",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_ovf_scheduler.n_list),
+ .schedule = ip_vs_ovf_schedule,
+};
+
+static int __init ip_vs_ovf_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_ovf_scheduler);
+}
+
+static void __exit ip_vs_ovf_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_ovf_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_ovf_init);
+module_exit(ip_vs_ovf_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c b/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c
index bed5f7042..1b8d594e4 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -70,7 +70,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
const char *dptr;
int retc;
- ip_vs_fill_iph_skb(p->af, skb, &iph);
+ ip_vs_fill_iph_skb(p->af, skb, false, &iph);
/* Only useful with UDP */
if (iph.protocol != IPPROTO_UDP)
diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto.c b/kernel/net/netfilter/ipvs/ip_vs_proto.c
index 939f7fbe9..8ae480715 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_proto.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_proto.c
@@ -63,9 +63,8 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
* register an ipvs protocols netns related data
*/
static int
-register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
struct ip_vs_proto_data *pd =
kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL);
@@ -79,7 +78,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
atomic_set(&pd->appcnt, 0); /* Init app counter */
if (pp->init_netns != NULL) {
- int ret = pp->init_netns(net, pd);
+ int ret = pp->init_netns(ipvs, pd);
if (ret) {
/* unlink an free proto data */
ipvs->proto_data_table[hash] = pd->next;
@@ -116,9 +115,8 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
* unregister an ipvs protocols netns data
*/
static int
-unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+unregister_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data **pd_p;
unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol);
@@ -127,7 +125,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
if (*pd_p == pd) {
*pd_p = pd->next;
if (pd->pp->exit_netns != NULL)
- pd->pp->exit_netns(net, pd);
+ pd->pp->exit_netns(ipvs, pd);
kfree(pd);
return 0;
}
@@ -156,8 +154,8 @@ EXPORT_SYMBOL(ip_vs_proto_get);
/*
* get ip_vs_protocol object data by netns and proto
*/
-static struct ip_vs_proto_data *
-__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
{
struct ip_vs_proto_data *pd;
unsigned int hash = IP_VS_PROTO_HASH(proto);
@@ -169,14 +167,6 @@ __ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
return NULL;
}
-
-struct ip_vs_proto_data *
-ip_vs_proto_data_get(struct net *net, unsigned short proto)
-{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
- return __ipvs_proto_data_get(ipvs, proto);
-}
EXPORT_SYMBOL(ip_vs_proto_data_get);
/*
@@ -317,7 +307,7 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
/*
* per network name-space init
*/
-int __net_init ip_vs_protocol_net_init(struct net *net)
+int __net_init ip_vs_protocol_net_init(struct netns_ipvs *ipvs)
{
int i, ret;
static struct ip_vs_protocol *protos[] = {
@@ -339,27 +329,26 @@ int __net_init ip_vs_protocol_net_init(struct net *net)
};
for (i = 0; i < ARRAY_SIZE(protos); i++) {
- ret = register_ip_vs_proto_netns(net, protos[i]);
+ ret = register_ip_vs_proto_netns(ipvs, protos[i]);
if (ret < 0)
goto cleanup;
}
return 0;
cleanup:
- ip_vs_protocol_net_cleanup(net);
+ ip_vs_protocol_net_cleanup(ipvs);
return ret;
}
-void __net_exit ip_vs_protocol_net_cleanup(struct net *net)
+void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd;
int i;
/* unregister all the ipvs proto data for this netns */
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
while ((pd = ipvs->proto_data_table[i]) != NULL)
- unregister_ip_vs_proto_netns(net, pd);
+ unregister_ip_vs_proto_netns(ipvs, pd);
}
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 5de3dd312..5320d3997 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -41,30 +41,28 @@ struct isakmp_hdr {
#define PORT_ISAKMP 500
static void
-ah_esp_conn_fill_param_proto(struct net *net, int af,
- const struct ip_vs_iphdr *iph, int inverse,
+ah_esp_conn_fill_param_proto(struct netns_ipvs *ipvs, int af,
+ const struct ip_vs_iphdr *iph,
struct ip_vs_conn_param *p)
{
- if (likely(!inverse))
- ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+ if (likely(!ip_vs_iph_inverse(iph)))
+ ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP,
&iph->saddr, htons(PORT_ISAKMP),
&iph->daddr, htons(PORT_ISAKMP), p);
else
- ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+ ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP,
&iph->daddr, htons(PORT_ISAKMP),
&iph->saddr, htons(PORT_ISAKMP), p);
}
static struct ip_vs_conn *
-ah_esp_conn_in_get(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph,
- int inverse)
+ah_esp_conn_in_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
- struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(ipvs, af, iph, &p);
cp = ip_vs_conn_in_get(&p);
if (!cp) {
/*
@@ -73,7 +71,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb,
*/
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
"%s%s %s->%s\n",
- inverse ? "ICMP+" : "",
+ ip_vs_iph_icmp(iph) ? "ICMP+" : "",
ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
@@ -84,19 +82,18 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb,
static struct ip_vs_conn *
-ah_esp_conn_out_get(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph, int inverse)
+ah_esp_conn_out_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
- struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(ipvs, af, iph, &p);
cp = ip_vs_conn_out_get(&p);
if (!cp) {
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
"%s%s %s->%s\n",
- inverse ? "ICMP+" : "",
+ ip_vs_iph_icmp(iph) ? "ICMP+" : "",
ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
@@ -107,7 +104,8 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
static int
-ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ah_esp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 5b84c0b56..010ddeec1 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -9,35 +9,44 @@
#include <net/ip_vs.h>
static int
-sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
- struct net *net;
struct ip_vs_service *svc;
- struct netns_ipvs *ipvs;
sctp_chunkhdr_t _schunkh, *sch;
sctp_sctphdr_t *sh, _sctph;
-
- sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
- if (sh == NULL) {
- *verdict = NF_DROP;
- return 0;
+ __be16 _ports[2], *ports = NULL;
+
+ if (likely(!ip_vs_iph_icmp(iph))) {
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (sh) {
+ sch = skb_header_pointer(
+ skb, iph->len + sizeof(sctp_sctphdr_t),
+ sizeof(_schunkh), &_schunkh);
+ if (sch && (sch->type == SCTP_CID_INIT ||
+ sysctl_sloppy_sctp(ipvs)))
+ ports = &sh->source;
+ }
+ } else {
+ ports = skb_header_pointer(
+ skb, iph->len, sizeof(_ports), &_ports);
}
- sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
- sizeof(_schunkh), &_schunkh);
- if (sch == NULL) {
+ if (!ports) {
*verdict = NF_DROP;
return 0;
}
- net = skb_net(skb);
- ipvs = net_ipvs(net);
rcu_read_lock();
- if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) &&
- (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
- &iph->daddr, sh->dest))) {
+ if (likely(!ip_vs_iph_inverse(iph)))
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->daddr, ports[1]);
+ else
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->saddr, ports[0]);
+ if (svc) {
int ignored;
if (ip_vs_todrop(ipvs)) {
@@ -474,14 +483,13 @@ static inline __u16 sctp_app_hashkey(__be16 port)
& SCTP_APP_TAB_MASK;
}
-static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
+static int sctp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
- struct netns_ipvs *ipvs = net_ipvs(net);
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP);
hash = sctp_app_hashkey(port);
@@ -498,9 +506,9 @@ out:
return ret;
}
-static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
+static void sctp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP);
atomic_dec(&pd->appcnt);
list_del_rcu(&inc->p_list);
@@ -508,7 +516,7 @@ static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
static int sctp_app_conn_bind(struct ip_vs_conn *cp)
{
- struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ struct netns_ipvs *ipvs = cp->ipvs;
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -549,10 +557,8 @@ out:
* timeouts is netns related now.
* ---------------------------------------------
*/
-static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __ip_vs_sctp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
sizeof(sctp_timeouts));
@@ -561,7 +567,7 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
return 0;
}
-static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
+static void __ip_vs_sctp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
kfree(pd->timeout_table);
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 8e92beb0c..d7024b2ed 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -32,27 +32,47 @@
#include <net/ip_vs.h>
static int
-tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
- struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
- struct netns_ipvs *ipvs;
+ __be16 _ports[2], *ports = NULL;
- th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
- if (th == NULL) {
+ /* In the event of icmp, we're only guaranteed to have the first 8
+ * bytes of the transport header, so we only check the rest of the
+ * TCP packet for non-ICMP packets
+ */
+ if (likely(!ip_vs_iph_icmp(iph))) {
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (th) {
+ if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
+ return 1;
+ ports = &th->source;
+ }
+ } else {
+ ports = skb_header_pointer(
+ skb, iph->len, sizeof(_ports), &_ports);
+ }
+
+ if (!ports) {
*verdict = NF_DROP;
return 0;
}
- net = skb_net(skb);
- ipvs = net_ipvs(net);
+
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
rcu_read_lock();
- if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
- (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
- &iph->daddr, th->dest))) {
+
+ if (likely(!ip_vs_iph_inverse(iph)))
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->daddr, ports[1]);
+ else
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->saddr, ports[0]);
+
+ if (svc) {
int ignored;
if (ip_vs_todrop(ipvs)) {
@@ -571,14 +591,13 @@ static inline __u16 tcp_app_hashkey(__be16 port)
}
-static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
+static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
- struct netns_ipvs *ipvs = net_ipvs(net);
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
hash = tcp_app_hashkey(port);
@@ -597,9 +616,9 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
static void
-tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
+tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
atomic_dec(&pd->appcnt);
list_del_rcu(&inc->p_list);
@@ -609,7 +628,7 @@ tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
- struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ struct netns_ipvs *ipvs = cp->ipvs;
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -653,9 +672,9 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/*
* Set LISTEN timeout. (ip_vs_conn_put will setup timer)
*/
-void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
+void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
{
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);
spin_lock_bh(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
@@ -668,10 +687,8 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
* timeouts is netns related now.
* ---------------------------------------------
*/
-static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
sizeof(tcp_timeouts));
@@ -681,7 +698,7 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
return 0;
}
-static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
+static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
kfree(pd->timeout_table);
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c
index b62a3c0ff..e494e9a88 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -29,28 +29,42 @@
#include <net/ip6_checksum.h>
static int
-udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
- struct net *net;
struct ip_vs_service *svc;
struct udphdr _udph, *uh;
+ __be16 _ports[2], *ports = NULL;
- /* IPv6 fragments, only first fragment will hit this */
- uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
- if (uh == NULL) {
+ if (likely(!ip_vs_iph_icmp(iph))) {
+ /* IPv6 fragments, only first fragment will hit this */
+ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+ if (uh)
+ ports = &uh->source;
+ } else {
+ ports = skb_header_pointer(
+ skb, iph->len, sizeof(_ports), &_ports);
+ }
+
+ if (!ports) {
*verdict = NF_DROP;
return 0;
}
- net = skb_net(skb);
+
rcu_read_lock();
- svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
- &iph->daddr, uh->dest);
+ if (likely(!ip_vs_iph_inverse(iph)))
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->daddr, ports[1]);
+ else
+ svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
+ &iph->saddr, ports[0]);
+
if (svc) {
int ignored;
- if (ip_vs_todrop(net_ipvs(net))) {
+ if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -348,14 +362,13 @@ static inline __u16 udp_app_hashkey(__be16 port)
}
-static int udp_register_app(struct net *net, struct ip_vs_app *inc)
+static int udp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
- struct netns_ipvs *ipvs = net_ipvs(net);
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
hash = udp_app_hashkey(port);
@@ -374,9 +387,9 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc)
static void
-udp_unregister_app(struct net *net, struct ip_vs_app *inc)
+udp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
- struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
atomic_dec(&pd->appcnt);
list_del_rcu(&inc->p_list);
@@ -385,7 +398,7 @@ udp_unregister_app(struct net *net, struct ip_vs_app *inc)
static int udp_app_conn_bind(struct ip_vs_conn *cp)
{
- struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ struct netns_ipvs *ipvs = cp->ipvs;
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -456,10 +469,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction,
cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
}
-static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
sizeof(udp_timeouts));
@@ -468,7 +479,7 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
return 0;
}
-static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
+static void __udp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
kfree(pd->timeout_table);
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_sched.c b/kernel/net/netfilter/ipvs/ip_vs_sched.c
index 199760c71..a2ff7d746 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_sched.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_sched.c
@@ -74,7 +74,7 @@ void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
if (sched->done_service)
sched->done_service(svc);
- /* svc->scheduler can not be set to NULL */
+ /* svc->scheduler can be set to NULL only by caller */
}
@@ -137,7 +137,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
{
- if (scheduler && scheduler->module)
+ if (scheduler)
module_put(scheduler->module);
}
@@ -147,21 +147,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
{
- struct ip_vs_scheduler *sched;
+ struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
+ char *sched_name = sched ? sched->name : "none";
- sched = rcu_dereference(svc->scheduler);
if (svc->fwmark) {
IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
- sched->name, svc->fwmark, svc->fwmark, msg);
+ sched_name, svc->fwmark, svc->fwmark, msg);
#ifdef CONFIG_IP_VS_IPV6
} else if (svc->af == AF_INET6) {
IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
- sched->name, ip_vs_proto_name(svc->protocol),
+ sched_name, ip_vs_proto_name(svc->protocol),
&svc->addr.in6, ntohs(svc->port), msg);
#endif
} else {
IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
- sched->name, ip_vs_proto_name(svc->protocol),
+ sched_name, ip_vs_proto_name(svc->protocol),
&svc->addr.ip, ntohs(svc->port), msg);
}
}
diff --git a/kernel/net/netfilter/ipvs/ip_vs_sh.c b/kernel/net/netfilter/ipvs/ip_vs_sh.c
index 98a13433b..1e373a5e4 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_sh.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_sh.c
@@ -280,35 +280,29 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
static inline __be16
ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
{
- __be16 port;
- struct tcphdr _tcph, *th;
- struct udphdr _udph, *uh;
- sctp_sctphdr_t _sctph, *sh;
+ __be16 _ports[2], *ports;
+ /* At this point we know that we have a valid packet of some kind.
+ * Because ICMP packets are only guaranteed to have the first 8
+ * bytes, let's just grab the ports. Fortunately they're in the
+ * same position for all three of the protocols we care about.
+ */
switch (iph->protocol) {
case IPPROTO_TCP:
- th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
- if (unlikely(th == NULL))
- return 0;
- port = th->source;
- break;
case IPPROTO_UDP:
- uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
- if (unlikely(uh == NULL))
- return 0;
- port = uh->source;
- break;
case IPPROTO_SCTP:
- sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
- if (unlikely(sh == NULL))
+ ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
+ &_ports);
+ if (unlikely(!ports))
return 0;
- port = sh->source;
- break;
+
+ if (likely(!ip_vs_iph_inverse(iph)))
+ return ports[0];
+ else
+ return ports[1];
default:
- port = 0;
+ return 0;
}
-
- return port;
}
@@ -322,6 +316,9 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_dest *dest;
struct ip_vs_sh_state *s;
__be16 port = 0;
+ const union nf_inet_addr *hash_addr;
+
+ hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
@@ -331,9 +328,9 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
s = (struct ip_vs_sh_state *) svc->sched_data;
if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
- dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
+ dest = ip_vs_sh_get_fallback(svc, s, hash_addr, port);
else
- dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
+ dest = ip_vs_sh_get(svc, s, hash_addr, port);
if (!dest) {
ip_vs_scheduler_err(svc, "no destination available");
@@ -341,7 +338,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
}
IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
- IP_VS_DBG_ADDR(svc->af, &iph->saddr),
+ IP_VS_DBG_ADDR(svc->af, hash_addr),
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port));
diff --git a/kernel/net/netfilter/ipvs/ip_vs_sync.c b/kernel/net/netfilter/ipvs/ip_vs_sync.c
index 19b9cce6c..803001a45 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_sync.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_sync.c
@@ -193,7 +193,7 @@ union ip_vs_sync_conn {
#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
struct ip_vs_sync_thread_data {
- struct net *net;
+ struct netns_ipvs *ipvs;
struct socket *sock;
char *buf;
int id;
@@ -262,6 +262,11 @@ struct ip_vs_sync_mesg {
/* ip_vs_sync_conn entries start here */
};
+union ipvs_sockaddr {
+ struct sockaddr_in in;
+ struct sockaddr_in6 in6;
+};
+
struct ip_vs_sync_buff {
struct list_head list;
unsigned long firstuse;
@@ -320,26 +325,28 @@ sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
* Create a new sync buffer for Version 1 proto.
*/
static inline struct ip_vs_sync_buff *
-ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len)
{
struct ip_vs_sync_buff *sb;
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
return NULL;
- sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg),
+ ipvs->mcfg.sync_maxlen);
+ sb->mesg = kmalloc(len, GFP_ATOMIC);
if (!sb->mesg) {
kfree(sb);
return NULL;
}
sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
sb->mesg->version = SYNC_PROTO_VER;
- sb->mesg->syncid = ipvs->master_syncid;
+ sb->mesg->syncid = ipvs->mcfg.syncid;
sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
sb->mesg->nr_conns = 0;
sb->mesg->spare = 0;
sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
- sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
+ sb->end = (unsigned char *)sb->mesg + len;
sb->firstuse = jiffies;
return sb;
@@ -402,7 +409,7 @@ select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
* Create a new sync buffer for Version 0 proto.
*/
static inline struct ip_vs_sync_buff *
-ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len)
{
struct ip_vs_sync_buff *sb;
struct ip_vs_sync_mesg_v0 *mesg;
@@ -410,17 +417,19 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
return NULL;
- sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0),
+ ipvs->mcfg.sync_maxlen);
+ sb->mesg = kmalloc(len, GFP_ATOMIC);
if (!sb->mesg) {
kfree(sb);
return NULL;
}
mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
mesg->nr_conns = 0;
- mesg->syncid = ipvs->master_syncid;
+ mesg->syncid = ipvs->mcfg.syncid;
mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
- sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
+ sb->end = (unsigned char *)mesg + len;
sb->firstuse = jiffies;
return sb;
}
@@ -524,16 +533,15 @@ set:
* Version 0 , could be switched in by sys_ctl.
* Add an ip_vs_conn information into the current sync_buff.
*/
-static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
int pkts)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg_v0 *m;
struct ip_vs_sync_conn_v0 *s;
struct ip_vs_sync_buff *buff;
struct ipvs_master_sync_state *ms;
int id;
- int len;
+ unsigned int len;
if (unlikely(cp->af != AF_INET))
return;
@@ -553,17 +561,19 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
id = select_master_thread_id(ipvs, cp);
ms = &ipvs->ms[id];
buff = ms->sync_buff;
+ len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+ SIMPLE_CONN_SIZE;
if (buff) {
m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
/* Send buffer if it is for v1 */
- if (!m->nr_conns) {
+ if (buff->head + len > buff->end || !m->nr_conns) {
sb_queue_tail(ipvs, ms);
ms->sync_buff = NULL;
buff = NULL;
}
}
if (!buff) {
- buff = ip_vs_sync_buff_create_v0(ipvs);
+ buff = ip_vs_sync_buff_create_v0(ipvs, len);
if (!buff) {
spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
@@ -572,8 +582,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
ms->sync_buff = buff;
}
- len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
- SIMPLE_CONN_SIZE;
m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
s = (struct ip_vs_sync_conn_v0 *) buff->head;
@@ -597,12 +605,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
m->nr_conns++;
m->size = htons(ntohs(m->size) + len);
buff->head += len;
-
- /* check if there is a space for next one */
- if (buff->head + FULL_CONN_SIZE > buff->end) {
- sb_queue_tail(ipvs, ms);
- ms->sync_buff = NULL;
- }
spin_unlock_bh(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
@@ -612,7 +614,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
pkts = atomic_add_return(1, &cp->in_pkts);
else
pkts = sysctl_sync_threshold(ipvs);
- ip_vs_sync_conn(net, cp->control, pkts);
+ ip_vs_sync_conn(ipvs, cp, pkts);
}
}
@@ -621,9 +623,8 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
* Called by ip_vs_in.
* Sending Version 1 messages
*/
-void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
+void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg *m;
union ip_vs_sync_conn *s;
struct ip_vs_sync_buff *buff;
@@ -634,7 +635,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
/* Handle old version of the protocol */
if (sysctl_sync_ver(ipvs) == 0) {
- ip_vs_sync_conn_v0(net, cp, pkts);
+ ip_vs_sync_conn_v0(ipvs, cp, pkts);
return;
}
/* Do not sync ONE PACKET */
@@ -694,7 +695,7 @@ sloop:
}
if (!buff) {
- buff = ip_vs_sync_buff_create(ipvs);
+ buff = ip_vs_sync_buff_create(ipvs, len);
if (!buff) {
spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
@@ -781,21 +782,21 @@ control:
* fill_param used by version 1
*/
static inline int
-ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
+ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc,
struct ip_vs_conn_param *p,
__u8 *pe_data, unsigned int pe_data_len,
__u8 *pe_name, unsigned int pe_name_len)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
- ip_vs_conn_fill_param(net, af, sc->v6.protocol,
+ ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol,
(const union nf_inet_addr *)&sc->v6.caddr,
sc->v6.cport,
(const union nf_inet_addr *)&sc->v6.vaddr,
sc->v6.vport, p);
else
#endif
- ip_vs_conn_fill_param(net, af, sc->v4.protocol,
+ ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol,
(const union nf_inet_addr *)&sc->v4.caddr,
sc->v4.cport,
(const union nf_inet_addr *)&sc->v4.vaddr,
@@ -834,7 +835,7 @@ ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
* Param: ...
* timeout is in sec.
*/
-static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param,
unsigned int flags, unsigned int state,
unsigned int protocol, unsigned int type,
const union nf_inet_addr *daddr, __be16 dport,
@@ -843,7 +844,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
{
struct ip_vs_dest *dest;
struct ip_vs_conn *cp;
- struct netns_ipvs *ipvs = net_ipvs(net);
if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
cp = ip_vs_conn_in_get(param);
@@ -901,7 +901,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
* with synchronization, so we can make the assumption that
* the svc_af is the same as the dest_af
*/
- dest = ip_vs_find_dest(net, type, type, daddr, dport,
+ dest = ip_vs_find_dest(ipvs, type, type, daddr, dport,
param->vaddr, param->vport, protocol,
fwmark, flags);
@@ -938,7 +938,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
} else {
struct ip_vs_proto_data *pd;
- pd = ip_vs_proto_data_get(net, protocol);
+ pd = ip_vs_proto_data_get(ipvs, protocol);
if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
cp->timeout = pd->timeout_table[state];
else
@@ -950,7 +950,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
/*
* Process received multicast message for Version 0
*/
-static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer,
const size_t buflen)
{
struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
@@ -1006,14 +1006,14 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer,
}
}
- ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+ ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
(const union nf_inet_addr *)&s->caddr,
s->cport,
(const union nf_inet_addr *)&s->vaddr,
s->vport, &param);
/* Send timeout as Zero */
- ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+ ip_vs_proc_conn(ipvs, &param, flags, state, s->protocol, AF_INET,
(union nf_inet_addr *)&s->daddr, s->dport,
0, 0, opt);
}
@@ -1064,7 +1064,7 @@ static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
/*
* Process a Version 1 sync. connection
*/
-static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end)
{
struct ip_vs_sync_conn_options opt;
union ip_vs_sync_conn *s;
@@ -1168,21 +1168,21 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
state = 0;
}
}
- if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+ if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data,
pe_data_len, pe_name, pe_name_len)) {
retc = 50;
goto out;
}
/* If only IPv4, just silent skip IPv6 */
if (af == AF_INET)
- ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+ ip_vs_proc_conn(ipvs, &param, flags, state, s->v4.protocol, af,
(union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
);
#ifdef CONFIG_IP_VS_IPV6
else
- ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+ ip_vs_proc_conn(ipvs, &param, flags, state, s->v6.protocol, af,
(union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
@@ -1201,10 +1201,9 @@ out:
* ip_vs_conn entries.
* Handles Version 0 & 1
*/
-static void ip_vs_process_message(struct net *net, __u8 *buffer,
+static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
const size_t buflen)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
__u8 *p, *msg_end;
int i, nr_conns;
@@ -1219,7 +1218,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
return;
}
/* SyncID sanity check */
- if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+ if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) {
IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
return;
}
@@ -1254,7 +1253,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
return;
}
/* Process a single sync_conn */
- retc = ip_vs_proc_sync_conn(net, p, msg_end);
+ retc = ip_vs_proc_sync_conn(ipvs, p, msg_end);
if (retc < 0) {
IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
retc);
@@ -1265,7 +1264,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
}
} else {
/* Old type of message */
- ip_vs_process_message_v0(net, buffer, buflen);
+ ip_vs_process_message_v0(ipvs, buffer, buflen);
return;
}
}
@@ -1303,6 +1302,14 @@ static void set_mcast_loop(struct sock *sk, u_char loop)
/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
lock_sock(sk);
inet->mc_loop = loop ? 1 : 0;
+#ifdef CONFIG_IP_VS_IPV6
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ /* IPV6_MULTICAST_LOOP */
+ np->mc_loop = loop ? 1 : 0;
+ }
+#endif
release_sock(sk);
}
@@ -1316,6 +1323,33 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
lock_sock(sk);
inet->mc_ttl = ttl;
+#ifdef CONFIG_IP_VS_IPV6
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ /* IPV6_MULTICAST_HOPS */
+ np->mcast_hops = ttl;
+ }
+#endif
+ release_sock(sk);
+}
+
+/* Control fragmentation of messages */
+static void set_mcast_pmtudisc(struct sock *sk, int val)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
+ lock_sock(sk);
+ inet->pmtudisc = val;
+#ifdef CONFIG_IP_VS_IPV6
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ /* IPV6_MTU_DISCOVER */
+ np->pmtudisc = val;
+ }
+#endif
release_sock(sk);
}
@@ -1338,44 +1372,15 @@ static int set_mcast_if(struct sock *sk, char *ifname)
lock_sock(sk);
inet->mc_index = dev->ifindex;
/* inet->mc_addr = 0; */
- release_sock(sk);
-
- return 0;
-}
-
+#ifdef CONFIG_IP_VS_IPV6
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
-/*
- * Set the maximum length of sync message according to the
- * specified interface's MTU.
- */
-static int set_sync_mesg_maxlen(struct net *net, int sync_state)
-{
- struct netns_ipvs *ipvs = net_ipvs(net);
- struct net_device *dev;
- int num;
-
- if (sync_state == IP_VS_STATE_MASTER) {
- dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
- if (!dev)
- return -ENODEV;
-
- num = (dev->mtu - sizeof(struct iphdr) -
- sizeof(struct udphdr) -
- SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
- ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
- SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
- IP_VS_DBG(7, "setting the maximum length of sync sending "
- "message %d.\n", ipvs->send_mesg_maxlen);
- } else if (sync_state == IP_VS_STATE_BACKUP) {
- dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
- if (!dev)
- return -ENODEV;
-
- ipvs->recv_mesg_maxlen = dev->mtu -
- sizeof(struct iphdr) - sizeof(struct udphdr);
- IP_VS_DBG(7, "setting the maximum length of sync receiving "
- "message %d.\n", ipvs->recv_mesg_maxlen);
+ /* IPV6_MULTICAST_IF */
+ np->mcast_oif = dev->ifindex;
}
+#endif
+ release_sock(sk);
return 0;
}
@@ -1405,15 +1410,34 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
mreq.imr_ifindex = dev->ifindex;
- rtnl_lock();
lock_sock(sk);
ret = ip_mc_join_group(sk, &mreq);
release_sock(sk);
- rtnl_unlock();
return ret;
}
+#ifdef CONFIG_IP_VS_IPV6
+static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
+ char *ifname)
+{
+ struct net *net = sock_net(sk);
+ struct net_device *dev;
+ int ret;
+
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
+ return -ENODEV;
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ lock_sock(sk);
+ ret = ipv6_sock_mc_join(sk, dev->ifindex, addr);
+ release_sock(sk);
+
+ return ret;
+}
+#endif
static int bind_mcastif_addr(struct socket *sock, char *ifname)
{
@@ -1442,53 +1466,69 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
}
+static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
+ struct ipvs_sync_daemon_cfg *c, int id)
+{
+ if (AF_INET6 == c->mcast_af) {
+ sa->in6 = (struct sockaddr_in6) {
+ .sin6_family = AF_INET6,
+ .sin6_port = htons(c->mcast_port + id),
+ };
+ sa->in6.sin6_addr = c->mcast_group.in6;
+ *salen = sizeof(sa->in6);
+ } else {
+ sa->in = (struct sockaddr_in) {
+ .sin_family = AF_INET,
+ .sin_port = htons(c->mcast_port + id),
+ };
+ sa->in.sin_addr = c->mcast_group.in;
+ *salen = sizeof(sa->in);
+ }
+}
+
/*
* Set up sending multicast socket over UDP
*/
-static struct socket *make_send_sock(struct net *net, int id)
+static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
/* multicast addr */
- struct sockaddr_in mcast_addr = {
- .sin_family = AF_INET,
- .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
- .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
- };
+ union ipvs_sockaddr mcast_addr;
struct socket *sock;
- int result;
+ int result, salen;
- /* First create a socket move it to right name space later */
- result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ /* First create a socket */
+ result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
+ IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
- /*
- * Kernel sockets that are a part of a namespace, should not
- * hold a reference to a namespace in order to allow to stop it.
- * After sk_change_net should be released using sk_release_kernel.
- */
- sk_change_net(sock->sk, net);
- result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
+ result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn);
if (result < 0) {
pr_err("Error setting outbound mcast interface\n");
goto error;
}
set_mcast_loop(sock->sk, 0);
- set_mcast_ttl(sock->sk, 1);
+ set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
+ /* Allow fragmentation if MTU changes */
+ set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
result = sysctl_sync_sock_size(ipvs);
if (result > 0)
set_sock_size(sock->sk, 1, result);
- result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
+ if (AF_INET == ipvs->mcfg.mcast_af)
+ result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn);
+ else
+ result = 0;
if (result < 0) {
pr_err("Error binding address of the mcast interface\n");
goto error;
}
+ get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
- sizeof(struct sockaddr), 0);
+ salen, 0);
if (result < 0) {
pr_err("Error connecting to the multicast addr\n");
goto error;
@@ -1497,7 +1537,7 @@ static struct socket *make_send_sock(struct net *net, int id)
return sock;
error:
- sk_release_kernel(sock->sk);
+ sock_release(sock);
return ERR_PTR(result);
}
@@ -1505,47 +1545,42 @@ error:
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket *make_receive_sock(struct net *net, int id)
+static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
/* multicast addr */
- struct sockaddr_in mcast_addr = {
- .sin_family = AF_INET,
- .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
- .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
- };
+ union ipvs_sockaddr mcast_addr;
struct socket *sock;
- int result;
+ int result, salen;
/* First create a socket */
- result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
+ IPPROTO_UDP, &sock);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
- /*
- * Kernel sockets that are a part of a namespace, should not
- * hold a reference to a namespace in order to allow to stop it.
- * After sk_change_net should be released using sk_release_kernel.
- */
- sk_change_net(sock->sk, net);
/* it is equivalent to the REUSEADDR option in user-space */
sock->sk->sk_reuse = SK_CAN_REUSE;
result = sysctl_sync_sock_size(ipvs);
if (result > 0)
set_sock_size(sock->sk, 0, result);
- result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
- sizeof(struct sockaddr));
+ get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
+ result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
if (result < 0) {
pr_err("Error binding to the multicast addr\n");
goto error;
}
/* join the multicast group */
- result = join_mcast_group(sock->sk,
- (struct in_addr *) &mcast_addr.sin_addr,
- ipvs->backup_mcast_ifn);
+#ifdef CONFIG_IP_VS_IPV6
+ if (ipvs->bcfg.mcast_af == AF_INET6)
+ result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
+ ipvs->bcfg.mcast_ifn);
+ else
+#endif
+ result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
+ ipvs->bcfg.mcast_ifn);
if (result < 0) {
pr_err("Error joining to the multicast group\n");
goto error;
@@ -1554,7 +1589,7 @@ static struct socket *make_receive_sock(struct net *net, int id)
return sock;
error:
- sk_release_kernel(sock->sk);
+ sock_release(sock);
return ERR_PTR(result);
}
@@ -1646,14 +1681,14 @@ next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
static int sync_thread_master(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
- struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct netns_ipvs *ipvs = tinfo->ipvs;
struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
struct sock *sk = tinfo->sock->sk;
struct ip_vs_sync_buff *sb;
pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
"syncid = %d, id = %d\n",
- ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id);
+ ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);
for (;;) {
sb = next_sync_buff(ipvs, ms);
@@ -1692,7 +1727,7 @@ done:
ip_vs_sync_buff_release(sb);
/* release the sending multicast socket */
- sk_release_kernel(tinfo->sock->sk);
+ sock_release(tinfo->sock);
kfree(tinfo);
return 0;
@@ -1702,12 +1737,12 @@ done:
static int sync_thread_backup(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
- struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct netns_ipvs *ipvs = tinfo->ipvs;
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
"syncid = %d, id = %d\n",
- ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);
+ ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
while (!kthread_should_stop()) {
wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -1717,19 +1752,19 @@ static int sync_thread_backup(void *data)
/* do we have data now? */
while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
len = ip_vs_receive(tinfo->sock, tinfo->buf,
- ipvs->recv_mesg_maxlen);
+ ipvs->bcfg.sync_maxlen);
if (len <= 0) {
if (len != -EAGAIN)
pr_err("receiving message error\n");
break;
}
- ip_vs_process_message(tinfo->net, tinfo->buf, len);
+ ip_vs_process_message(ipvs, tinfo->buf, len);
}
}
/* release the sending multicast socket */
- sk_release_kernel(tinfo->sock->sk);
+ sock_release(tinfo->sock);
kfree(tinfo->buf);
kfree(tinfo);
@@ -1737,16 +1772,18 @@ static int sync_thread_backup(void *data)
}
-int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
+int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
+ int state)
{
struct ip_vs_sync_thread_data *tinfo;
struct task_struct **array = NULL, *task;
struct socket *sock;
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net_device *dev;
char *name;
int (*threadfn)(void *data);
- int id, count;
+ int id, count, hlen;
int result = -ENOMEM;
+ u16 mtu, min_mtu;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
@@ -1758,22 +1795,46 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
} else
count = ipvs->threads_mask + 1;
+ if (c->mcast_af == AF_UNSPEC) {
+ c->mcast_af = AF_INET;
+ c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
+ }
+ if (!c->mcast_port)
+ c->mcast_port = IP_VS_SYNC_PORT;
+ if (!c->mcast_ttl)
+ c->mcast_ttl = 1;
+
+ dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
+ if (!dev) {
+ pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
+ return -ENODEV;
+ }
+ hlen = (AF_INET6 == c->mcast_af) ?
+ sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
+ sizeof(struct iphdr) + sizeof(struct udphdr);
+ mtu = (state == IP_VS_STATE_BACKUP) ?
+ clamp(dev->mtu, 1500U, 65535U) : 1500U;
+ min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;
+
+ if (c->sync_maxlen)
+ c->sync_maxlen = clamp_t(unsigned int,
+ c->sync_maxlen, min_mtu,
+ 65535 - hlen);
+ else
+ c->sync_maxlen = mtu - hlen;
+
if (state == IP_VS_STATE_MASTER) {
if (ipvs->ms)
return -EEXIST;
- strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
- sizeof(ipvs->master_mcast_ifn));
- ipvs->master_syncid = syncid;
+ ipvs->mcfg = *c;
name = "ipvs-m:%d:%d";
threadfn = sync_thread_master;
} else if (state == IP_VS_STATE_BACKUP) {
if (ipvs->backup_threads)
return -EEXIST;
- strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
- sizeof(ipvs->backup_mcast_ifn));
- ipvs->backup_syncid = syncid;
+ ipvs->bcfg = *c;
name = "ipvs-b:%d:%d";
threadfn = sync_thread_backup;
} else {
@@ -1801,14 +1862,13 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
if (!array)
goto out;
}
- set_sync_mesg_maxlen(net, state);
tinfo = NULL;
for (id = 0; id < count; id++) {
if (state == IP_VS_STATE_MASTER)
- sock = make_send_sock(net, id);
+ sock = make_send_sock(ipvs, id);
else
- sock = make_receive_sock(net, id);
+ sock = make_receive_sock(ipvs, id);
if (IS_ERR(sock)) {
result = PTR_ERR(sock);
goto outtinfo;
@@ -1816,10 +1876,10 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
goto outsocket;
- tinfo->net = net;
+ tinfo->ipvs = ipvs;
tinfo->sock = sock;
if (state == IP_VS_STATE_BACKUP) {
- tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen,
+ tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
GFP_KERNEL);
if (!tinfo->buf)
goto outtinfo;
@@ -1854,11 +1914,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
return 0;
outsocket:
- sk_release_kernel(sock->sk);
+ sock_release(sock);
outtinfo:
if (tinfo) {
- sk_release_kernel(tinfo->sock->sk);
+ sock_release(tinfo->sock);
kfree(tinfo->buf);
kfree(tinfo);
}
@@ -1880,9 +1940,8 @@ out:
}
-int stop_sync_thread(struct net *net, int state)
+int stop_sync_thread(struct netns_ipvs *ipvs, int state)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct task_struct **array;
int id;
int retc = -EINVAL;
@@ -1948,27 +2007,24 @@ int stop_sync_thread(struct net *net, int state)
/*
* Initialize data struct for each netns
*/
-int __net_init ip_vs_sync_net_init(struct net *net)
+int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
__mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
spin_lock_init(&ipvs->sync_lock);
spin_lock_init(&ipvs->sync_buff_lock);
return 0;
}
-void ip_vs_sync_net_cleanup(struct net *net)
+void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
{
int retc;
- struct netns_ipvs *ipvs = net_ipvs(net);
mutex_lock(&ipvs->sync_mutex);
- retc = stop_sync_thread(net, IP_VS_STATE_MASTER);
+ retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
if (retc && retc != -ESRCH)
pr_err("Failed to stop Master Daemon\n");
- retc = stop_sync_thread(net, IP_VS_STATE_BACKUP);
+ retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
if (retc && retc != -ESRCH)
pr_err("Failed to stop Backup Daemon\n");
mutex_unlock(&ipvs->sync_mutex);
diff --git a/kernel/net/netfilter/ipvs/ip_vs_xmit.c b/kernel/net/netfilter/ipvs/ip_vs_xmit.c
index 19986ec5f..3264cb49b 100644
--- a/kernel/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/kernel/net/netfilter/ipvs/ip_vs_xmit.c
@@ -130,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
- fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
FLOWI_FLAG_KNOWN_NH : 0;
@@ -213,19 +212,20 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
}
-static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode,
+static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
+ int rt_mode,
struct ip_vs_iphdr *ipvsh,
struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
if (skb_af == AF_INET6) {
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = ipvs->net;
if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
if (!skb->dev)
skb->dev = net->loopback_dev;
/* only send ICMP too big on first fragment */
- if (!ipvsh->fragoffs)
+ if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
IP_VS_DBG(1, "frag needed for %pI6c\n",
&ipv6_hdr(skb)->saddr);
@@ -234,8 +234,6 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode,
} else
#endif
{
- struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
-
/* If we're going to tunnel the packet and pmtu discovery
* is disabled, we'll just fragment it anyway
*/
@@ -243,7 +241,8 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode,
return true;
if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
- skb->len > mtu && !skb_is_gso(skb))) {
+ skb->len > mtu && !skb_is_gso(skb) &&
+ !ip_vs_iph_icmp(ipvsh))) {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
IP_VS_DBG(1, "frag needed for %pI4\n",
@@ -257,11 +256,12 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode,
/* Get route to destination or remote server */
static int
-__ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
+__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
+ struct ip_vs_dest *dest,
__be32 daddr, int rt_mode, __be32 *ret_saddr,
struct ip_vs_iphdr *ipvsh)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = ipvs->net;
struct ip_vs_dest_dst *dest_dst;
struct rtable *rt; /* Route to the other host */
int mtu;
@@ -337,7 +337,7 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
maybe_update_pmtu(skb_af, skb, mtu);
}
- if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu))
+ if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
goto err_put;
skb_dst_drop(skb);
@@ -364,13 +364,16 @@ err_unreach:
#ifdef CONFIG_IP_VS_IPV6
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
- struct in6_addr *ret_saddr, int do_xfrm)
+ struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
struct dst_entry *dst;
struct flowi6 fl6 = {
.daddr = *daddr,
};
+ if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
+ fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
+
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error)
goto out_err;
@@ -400,11 +403,12 @@ out_err:
* Get route to destination or remote server
*/
static int
-__ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
+__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
+ struct ip_vs_dest *dest,
struct in6_addr *daddr, struct in6_addr *ret_saddr,
struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = ipvs->net;
struct ip_vs_dest_dst *dest_dst;
struct rt6_info *rt; /* Route to the other host */
struct dst_entry *dst;
@@ -427,7 +431,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
}
dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
&dest_dst->dst_saddr.in6,
- do_xfrm);
+ do_xfrm, rt_mode);
if (!dst) {
__ip_vs_dst_set(dest, NULL, NULL, 0);
spin_unlock_bh(&dest->dst_lock);
@@ -435,7 +439,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
goto err_unreach;
}
rt = (struct rt6_info *) dst;
- cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ cookie = rt6_get_cookie(rt);
__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
@@ -446,7 +450,8 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
*ret_saddr = dest_dst->dst_saddr.in6;
} else {
noref = 0;
- dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+ dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
+ rt_mode);
if (!dst)
goto err_unreach;
rt = (struct rt6_info *) dst;
@@ -481,7 +486,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
maybe_update_pmtu(skb_af, skb, mtu);
}
- if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu))
+ if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
goto err_put;
skb_dst_drop(skb);
@@ -501,6 +506,13 @@ err_put:
return -1;
err_unreach:
+ /* The ip6_link_failure function requires the dev field to be set
+ * in order to get the net (further for the sake of fwmark
+ * reflection).
+ */
+ if (!skb->dev)
+ skb->dev = skb_dst(skb)->dev;
+
dst_link_failure(skb);
return -1;
}
@@ -519,10 +531,27 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
if (ret == NF_ACCEPT) {
nf_reset(skb);
skb_forward_csum(skb);
+ if (!skb->sk)
+ skb_sender_cpu_clear(skb);
}
return ret;
}
+/* In the event of a remote destination, it's possible that we would have
+ * matches against an old socket (particularly a TIME-WAIT socket). This
+ * causes havoc down the line (ip_local_out et al. expect regular sockets
+ * and invalid memory accesses will happen) so simply drop the association
+ * in this case.
+ */
+static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
+{
+ /* If dev is set, the packet came from the LOCAL_IN callback and
+ * not from a local TCP socket.
+ */
+ if (skb->dev)
+ skb_orphan(skb);
+}
+
/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
struct ip_vs_conn *cp, int local)
@@ -534,12 +563,23 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 1);
+
+ /* Remove the early_demux association unless it's bound for the
+ * exact same port and address on this host after translation.
+ */
+ if (!local || cp->vport != cp->dport ||
+ !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
+ ip_vs_drop_early_demux_sk(skb);
+
if (!local) {
skb_forward_csum(skb);
- NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
- NULL, skb_dst(skb)->dev, dst_output_sk);
+ if (!skb->sk)
+ skb_sender_cpu_clear(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
+ NULL, skb_dst(skb)->dev, dst_output);
} else
ret = NF_ACCEPT;
+
return ret;
}
@@ -553,9 +593,12 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
ip_vs_notrack(skb);
if (!local) {
+ ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
- NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
- NULL, skb_dst(skb)->dev, dst_output_sk);
+ if (!skb->sk)
+ skb_sender_cpu_clear(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
+ NULL, skb_dst(skb)->dev, dst_output);
} else
ret = NF_ACCEPT;
return ret;
@@ -588,7 +631,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
rcu_read_lock();
- if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr,
+ if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
goto tx_error;
@@ -615,10 +658,13 @@ int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+
EnterFunction(10);
rcu_read_lock();
- if (__ip_vs_get_out_rt_v6(cp->af, skb, NULL, &ipvsh->daddr.in6, NULL,
+ if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
+ &iph->daddr, NULL,
ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
goto tx_error;
@@ -665,7 +711,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
was_input = rt_is_input_route(skb_rtable(skb));
- local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR, NULL, ipvsh);
@@ -682,7 +728,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
- IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
"ip_vs_nat_xmit(): "
"stopping DNAT to local address");
goto tx_error;
@@ -692,8 +738,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* From world but DNAT to loopback address? */
if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
- IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
- "stopping DNAT to loopback address");
+ IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
+ "ip_vs_nat_xmit(): stopping DNAT to loopback "
+ "address");
goto tx_error;
}
@@ -710,7 +757,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_hdr(skb)->daddr = cp->daddr.ip;
ip_send_check(ip_hdr(skb));
- IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
+ IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
@@ -753,7 +800,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ &cp->daddr.in6,
NULL, ipvsh, 0,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
@@ -771,7 +819,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
- IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to local address");
goto tx_error;
@@ -781,8 +829,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* From world but DNAT to loopback address? */
if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
- IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+ ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to loopback address");
goto tx_error;
@@ -800,7 +848,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error;
ipv6_hdr(skb)->daddr = cp->daddr.in6;
- IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
+ IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
@@ -841,6 +889,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
struct ipv6hdr *old_ipv6h = NULL;
#endif
+ ip_vs_drop_early_demux_sk(skb);
+
if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
new_skb = skb_realloc_headroom(skb, max_headroom);
if (!new_skb)
@@ -924,8 +974,8 @@ int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct net *net = skb_net(skb);
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct netns_ipvs *ipvs = cp->ipvs;
+ struct net *net = ipvs->net;
struct rtable *rt; /* Route to the other host */
__be32 saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
@@ -941,7 +991,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
rcu_read_lock();
- local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_CONNECT |
@@ -999,7 +1049,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
- ip_local_out(skb);
+ ip_local_out(net, skb->sk, skb);
else if (ret == NF_DROP)
kfree_skb(skb);
rcu_read_unlock();
@@ -1035,7 +1085,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
rcu_read_lock();
- local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ &cp->daddr.in6,
&saddr, ipvsh, 1,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
@@ -1090,7 +1141,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
- ip6_local_out(skb);
+ ip6_local_out(cp->ipvs->net, skb->sk, skb);
else if (ret == NF_DROP)
kfree_skb(skb);
rcu_read_unlock();
@@ -1122,7 +1173,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
rcu_read_lock();
- local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
@@ -1161,10 +1212,12 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
rcu_read_lock();
- local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ &cp->daddr.in6,
NULL, ipvsh, 0,
IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL);
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_KNOWN_NH);
if (local < 0)
goto tx_error;
if (local) {
@@ -1229,7 +1282,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
rcu_read_lock();
- local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
+ local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
NULL, iph);
if (local < 0)
goto tx_error;
@@ -1321,8 +1374,8 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
rcu_read_lock();
- local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
- NULL, ipvsh, 0, rt_mode);
+ local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
if (local < 0)
goto tx_error;
rt = (struct rt6_info *) skb_dst(skb);
@@ -1346,7 +1399,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* From world but DNAT to loopback address? */
if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI6\n",
__func__, &cp->daddr.in6);
diff --git a/kernel/net/netfilter/nf_conntrack_core.c b/kernel/net/netfilter/nf_conntrack_core.c
index 13fad8668..3cb3cb831 100644
--- a/kernel/net/netfilter/nf_conntrack_core.c
+++ b/kernel/net/netfilter/nf_conntrack_core.c
@@ -126,7 +126,7 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
unsigned int nf_conntrack_hash_rnd __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
-static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple)
{
unsigned int n;
@@ -135,7 +135,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
* three bytes manually.
*/
n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+ return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^
(((__force __u16)tuple->dst.u.all << 16) |
tuple->dst.protonum));
}
@@ -151,15 +151,15 @@ static u32 hash_bucket(u32 hash, const struct net *net)
}
static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- u16 zone, unsigned int size)
+ unsigned int size)
{
- return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
+ return __hash_bucket(hash_conntrack_raw(tuple), size);
}
-static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
+static inline u_int32_t hash_conntrack(const struct net *net,
const struct nf_conntrack_tuple *tuple)
{
- return __hash_conntrack(tuple, zone, net->ct.htable_size);
+ return __hash_conntrack(tuple, net->ct.htable_size);
}
bool
@@ -168,6 +168,7 @@ nf_ct_get_tuple(const struct sk_buff *skb,
unsigned int dataoff,
u_int16_t l3num,
u_int8_t protonum,
+ struct net *net,
struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
@@ -181,12 +182,13 @@ nf_ct_get_tuple(const struct sk_buff *skb,
tuple->dst.protonum = protonum;
tuple->dst.dir = IP_CT_DIR_ORIGINAL;
- return l4proto->pkt_to_tuple(skb, dataoff, tuple);
+ return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
- u_int16_t l3num, struct nf_conntrack_tuple *tuple)
+ u_int16_t l3num,
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
@@ -205,7 +207,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
l4proto = __nf_ct_l4proto_find(l3num, protonum);
- ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
+ ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
l3proto, l4proto);
rcu_read_unlock();
@@ -287,6 +289,40 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
spin_unlock(&pcpu->lock);
}
+/* Released via destroy_conntrack() */
+struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
+ const struct nf_conntrack_zone *zone,
+ gfp_t flags)
+{
+ struct nf_conn *tmpl;
+
+ tmpl = kzalloc(sizeof(*tmpl), flags);
+ if (tmpl == NULL)
+ return NULL;
+
+ tmpl->status = IPS_TEMPLATE;
+ write_pnet(&tmpl->ct_net, net);
+
+ if (nf_ct_zone_add(tmpl, flags, zone) < 0)
+ goto out_free;
+
+ atomic_set(&tmpl->ct_general.use, 0);
+
+ return tmpl;
+out_free:
+ kfree(tmpl);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
+
+void nf_ct_tmpl_free(struct nf_conn *tmpl)
+{
+ nf_ct_ext_destroy(tmpl);
+ nf_ct_ext_free(tmpl);
+ kfree(tmpl);
+}
+EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
+
static void
destroy_conntrack(struct nf_conntrack *nfct)
{
@@ -298,6 +334,10 @@ destroy_conntrack(struct nf_conntrack *nfct)
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
NF_CT_ASSERT(!timer_pending(&ct->timeout));
+ if (unlikely(nf_ct_is_template(ct))) {
+ nf_ct_tmpl_free(ct);
+ return;
+ }
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
if (l4proto && l4proto->destroy)
@@ -329,7 +369,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
unsigned int hash, reply_hash;
- u16 zone = nf_ct_zone(ct);
unsigned int sequence;
nf_ct_helper_destroy(ct);
@@ -337,9 +376,9 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
local_bh_disable();
do {
sequence = read_seqcount_begin(&net->ct.generation);
- hash = hash_conntrack(net, zone,
+ hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- reply_hash = hash_conntrack(net, zone,
+ reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
@@ -387,8 +426,8 @@ static void death_by_timeout(unsigned long ul_conntrack)
static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
- const struct nf_conntrack_tuple *tuple,
- u16 zone)
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone)
{
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
@@ -396,8 +435,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
* so we need to check that the conntrack is confirmed
*/
return nf_ct_tuple_equal(tuple, &h->tuple) &&
- nf_ct_zone(ct) == zone &&
- nf_ct_is_confirmed(ct);
+ nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
+ nf_ct_is_confirmed(ct);
}
/*
@@ -406,7 +445,7 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
* and recheck nf_ct_tuple_equal(tuple, &h->tuple)
*/
static struct nf_conntrack_tuple_hash *
-____nf_conntrack_find(struct net *net, u16 zone,
+____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
@@ -442,7 +481,7 @@ begin:
/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
-__nf_conntrack_find_get(struct net *net, u16 zone,
+__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
@@ -469,11 +508,11 @@ begin:
}
struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
+nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
return __nf_conntrack_find_get(net, zone, tuple,
- hash_conntrack_raw(tuple, zone));
+ hash_conntrack_raw(tuple));
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
@@ -492,11 +531,11 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
+ const struct nf_conntrack_zone *zone;
struct net *net = nf_ct_net(ct);
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- u16 zone;
unsigned int sequence;
zone = nf_ct_zone(ct);
@@ -504,9 +543,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
local_bh_disable();
do {
sequence = read_seqcount_begin(&net->ct.generation);
- hash = hash_conntrack(net, zone,
+ hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- reply_hash = hash_conntrack(net, zone,
+ reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
@@ -514,12 +553,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
&h->tuple) &&
- zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+ NF_CT_DIRECTION(h)))
goto out;
hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
&h->tuple) &&
- zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+ NF_CT_DIRECTION(h)))
goto out;
add_timer(&ct->timeout);
@@ -540,32 +581,11 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
-/* deletion from this larval template list happens via nf_ct_put() */
-void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
-{
- struct ct_pcpu *pcpu;
-
- __set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
- __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
- nf_conntrack_get(&tmpl->ct_general);
-
- /* add this conntrack to the (per cpu) tmpl list */
- local_bh_disable();
- tmpl->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
-
- spin_lock(&pcpu->lock);
- /* Overload tuple linked list to put us in template list. */
- hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->tmpl);
- spin_unlock_bh(&pcpu->lock);
-}
-EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
-
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
+ const struct nf_conntrack_zone *zone;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
@@ -574,7 +594,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
- u16 zone;
unsigned int sequence;
ct = nf_ct_get(skb, &ctinfo);
@@ -595,7 +614,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
/* reuse the hash saved before */
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
hash = hash_bucket(hash, net);
- reply_hash = hash_conntrack(net, zone,
+ reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
@@ -627,12 +646,14 @@ __nf_conntrack_confirm(struct sk_buff *skb)
hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
&h->tuple) &&
- zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+ NF_CT_DIRECTION(h)))
goto out;
hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
&h->tuple) &&
- zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone,
+ NF_CT_DIRECTION(h)))
goto out;
/* Timer relative to confirmation time, not original
@@ -685,11 +706,14 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
const struct nf_conn *ignored_conntrack)
{
struct net *net = nf_ct_net(ignored_conntrack);
+ const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
struct nf_conn *ct;
- u16 zone = nf_ct_zone(ignored_conntrack);
- unsigned int hash = hash_conntrack(net, zone, tuple);
+ unsigned int hash;
+
+ zone = nf_ct_zone(ignored_conntrack);
+ hash = hash_conntrack(net, tuple);
/* Disable BHs the entire time since we need to disable them at
* least once for the stats anyway.
@@ -699,7 +723,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
ct = nf_ct_tuplehash_to_ctrack(h);
if (ct != ignored_conntrack &&
nf_ct_tuple_equal(tuple, &h->tuple) &&
- nf_ct_zone(ct) == zone) {
+ nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) {
NF_CT_STAT_INC(net, found);
rcu_read_unlock_bh();
return 1;
@@ -788,7 +812,8 @@ void init_nf_conntrack_hash_rnd(void)
}
static struct nf_conn *
-__nf_conntrack_alloc(struct net *net, u16 zone,
+__nf_conntrack_alloc(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
gfp_t gfp, u32 hash)
@@ -798,7 +823,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
if (unlikely(!nf_conntrack_hash_rnd)) {
init_nf_conntrack_hash_rnd();
/* recompute the hash as nf_conntrack_hash_rnd is initialized */
- hash = hash_conntrack_raw(orig, zone);
+ hash = hash_conntrack_raw(orig);
}
/* We don't want any race condition at early drop stage */
@@ -818,10 +843,9 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
* SLAB_DESTROY_BY_RCU.
*/
ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
- if (ct == NULL) {
- atomic_dec(&net->ct.count);
- return ERR_PTR(-ENOMEM);
- }
+ if (ct == NULL)
+ goto out;
+
spin_lock_init(&ct->lock);
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
@@ -835,31 +859,24 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
memset(&ct->__nfct_init_offset[0], 0,
offsetof(struct nf_conn, proto) -
offsetof(struct nf_conn, __nfct_init_offset[0]));
-#ifdef CONFIG_NF_CONNTRACK_ZONES
- if (zone) {
- struct nf_conntrack_zone *nf_ct_zone;
- nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC);
- if (!nf_ct_zone)
- goto out_free;
- nf_ct_zone->id = zone;
- }
-#endif
+ if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0)
+ goto out_free;
+
/* Because we use RCU lookups, we set ct_general.use to zero before
* this is inserted in any list.
*/
atomic_set(&ct->ct_general.use, 0);
return ct;
-
-#ifdef CONFIG_NF_CONNTRACK_ZONES
out_free:
- atomic_dec(&net->ct.count);
kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+out:
+ atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
-#endif
}
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+struct nf_conn *nf_conntrack_alloc(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
gfp_t gfp)
@@ -901,8 +918,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_tuple repl_tuple;
struct nf_conntrack_ecache *ecache;
struct nf_conntrack_expect *exp = NULL;
- u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
+ struct nf_conntrack_zone tmp;
unsigned int *timeouts;
if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
@@ -910,6 +928,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
return NULL;
}
+ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
hash);
if (IS_ERR(ct))
@@ -921,10 +940,13 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
- if (timeout_ext)
- timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
- else
+ if (timeout_ext) {
+ timeouts = nf_ct_timeout_data(timeout_ext);
+ if (unlikely(!timeouts))
+ timeouts = l4proto->get_timeouts(net);
+ } else {
timeouts = l4proto->get_timeouts(net);
+ }
if (!l4proto->new(ct, skb, dataoff, timeouts)) {
nf_conntrack_free(ct);
@@ -933,7 +955,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
if (timeout_ext)
- nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC);
+ nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
+ GFP_ATOMIC);
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
@@ -1004,21 +1027,23 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
int *set_reply,
enum ip_conntrack_info *ctinfo)
{
+ const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_zone tmp;
struct nf_conn *ct;
- u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
- dataoff, l3num, protonum, &tuple, l3proto,
+ dataoff, l3num, protonum, net, &tuple, l3proto,
l4proto)) {
pr_debug("resolve_normal_ct: Can't get tuple\n");
return NULL;
}
/* look for tuple match */
- hash = hash_conntrack_raw(&tuple, zone);
+ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
+ hash = hash_conntrack_raw(&tuple);
h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
@@ -1522,10 +1547,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
sz = nr_slots * sizeof(struct hlist_nulls_head);
hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
get_order(sz));
- if (!hash) {
- printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
+ if (!hash)
hash = vzalloc(sz);
- }
if (hash && nulls)
for (i = 0; i < nr_slots; i++)
@@ -1576,8 +1599,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
struct nf_conntrack_tuple_hash, hnnode);
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
- bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
- hashsize);
+ bucket = __hash_conntrack(&h->tuple, hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
@@ -1751,7 +1773,6 @@ int nf_conntrack_init_net(struct net *net)
spin_lock_init(&pcpu->lock);
INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
- INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
}
net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
diff --git a/kernel/net/netfilter/nf_conntrack_expect.c b/kernel/net/netfilter/nf_conntrack_expect.c
index 7a17070c5..acf5c7b3f 100644
--- a/kernel/net/netfilter/nf_conntrack_expect.c
+++ b/kernel/net/netfilter/nf_conntrack_expect.c
@@ -88,7 +88,8 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple
}
struct nf_conntrack_expect *
-__nf_ct_expect_find(struct net *net, u16 zone,
+__nf_ct_expect_find(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_expect *i;
@@ -100,7 +101,7 @@ __nf_ct_expect_find(struct net *net, u16 zone,
h = nf_ct_expect_dst_hash(tuple);
hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
- nf_ct_zone(i->master) == zone)
+ nf_ct_zone_equal_any(i->master, zone))
return i;
}
return NULL;
@@ -109,7 +110,8 @@ EXPORT_SYMBOL_GPL(__nf_ct_expect_find);
/* Just find a expectation corresponding to a tuple. */
struct nf_conntrack_expect *
-nf_ct_expect_find_get(struct net *net, u16 zone,
+nf_ct_expect_find_get(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_expect *i;
@@ -127,7 +129,8 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
/* If an expectation for this connection is found, it gets delete from
* global list then returned. */
struct nf_conntrack_expect *
-nf_ct_find_expectation(struct net *net, u16 zone,
+nf_ct_find_expectation(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
struct nf_conntrack_expect *i, *exp = NULL;
@@ -140,7 +143,7 @@ nf_ct_find_expectation(struct net *net, u16 zone,
hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
- nf_ct_zone(i->master) == zone) {
+ nf_ct_zone_equal_any(i->master, zone)) {
exp = i;
break;
}
@@ -219,16 +222,17 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
}
- return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
+ return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) &&
+ nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
static inline int expect_matches(const struct nf_conntrack_expect *a,
const struct nf_conntrack_expect *b)
{
return a->master == b->master && a->class == b->class &&
- nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
- nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
- nf_ct_zone(a->master) == nf_ct_zone(b->master);
+ nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
+ nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
+ nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
/* Generally a bad idea to call this: could have matched already. */
diff --git a/kernel/net/netfilter/nf_conntrack_h323_main.c b/kernel/net/netfilter/nf_conntrack_h323_main.c
index 1d69f5b97..9511af04d 100644
--- a/kernel/net/netfilter/nf_conntrack_h323_main.c
+++ b/kernel/net/netfilter/nf_conntrack_h323_main.c
@@ -779,8 +779,8 @@ static int callforward_do_filter(struct net *net,
flowi6_to_flowi(&fl1), false)) {
if (!afinfo->route(net, (struct dst_entry **)&rt2,
flowi6_to_flowi(&fl2), false)) {
- if (ipv6_addr_equal(rt6_nexthop(rt1),
- rt6_nexthop(rt2)) &&
+ if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr),
+ rt6_nexthop(rt2, &fl2.daddr)) &&
rt1->dst.dev == rt2->dst.dev)
ret = 1;
dst_release(&rt2->dst);
diff --git a/kernel/net/netfilter/nf_conntrack_labels.c b/kernel/net/netfilter/nf_conntrack_labels.c
index bb53f120e..3ce5c314e 100644
--- a/kernel/net/netfilter/nf_conntrack_labels.c
+++ b/kernel/net/netfilter/nf_conntrack_labels.c
@@ -14,6 +14,8 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_labels.h>
+static spinlock_t nf_connlabels_lock;
+
static unsigned int label_bits(const struct nf_conn_labels *l)
{
unsigned int longs = l->words;
@@ -48,7 +50,6 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit)
}
EXPORT_SYMBOL_GPL(nf_connlabel_set);
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static void replace_u32(u32 *address, u32 mask, u32 new)
{
u32 old, tmp;
@@ -89,7 +90,35 @@ int nf_connlabels_replace(struct nf_conn *ct,
return 0;
}
EXPORT_SYMBOL_GPL(nf_connlabels_replace);
-#endif
+
+int nf_connlabels_get(struct net *net, unsigned int n_bits)
+{
+ size_t words;
+
+ if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE))
+ return -ERANGE;
+
+ words = BITS_TO_LONGS(n_bits);
+
+ spin_lock(&nf_connlabels_lock);
+ net->ct.labels_used++;
+ if (words > net->ct.label_words)
+ net->ct.label_words = words;
+ spin_unlock(&nf_connlabels_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_get);
+
+void nf_connlabels_put(struct net *net)
+{
+ spin_lock(&nf_connlabels_lock);
+ net->ct.labels_used--;
+ if (net->ct.labels_used == 0)
+ net->ct.label_words = 0;
+ spin_unlock(&nf_connlabels_lock);
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_put);
static struct nf_ct_ext_type labels_extend __read_mostly = {
.len = sizeof(struct nf_conn_labels),
@@ -99,6 +128,7 @@ static struct nf_ct_ext_type labels_extend __read_mostly = {
int nf_conntrack_labels_init(void)
{
+ spin_lock_init(&nf_connlabels_lock);
return nf_ct_extend_register(&labels_extend);
}
diff --git a/kernel/net/netfilter/nf_conntrack_netlink.c b/kernel/net/netfilter/nf_conntrack_netlink.c
index d1c23940a..9f5272968 100644
--- a/kernel/net/netfilter/nf_conntrack_netlink.c
+++ b/kernel/net/netfilter/nf_conntrack_netlink.c
@@ -128,6 +128,20 @@ ctnetlink_dump_tuples(struct sk_buff *skb,
}
static inline int
+ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype,
+ const struct nf_conntrack_zone *zone, int dir)
+{
+ if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir)
+ return 0;
+ if (nla_put_be16(skb, attrtype, htons(zone->id)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static inline int
ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status)))
@@ -458,6 +472,7 @@ static int
ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
struct nf_conn *ct)
{
+ const struct nf_conntrack_zone *zone;
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
@@ -473,11 +488,16 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
+ zone = nf_ct_zone(ct);
+
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_ORIG) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
@@ -485,10 +505,13 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_REPL) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- if (nf_ct_zone(ct) &&
- nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
+ NF_CT_DEFAULT_ZONE_DIR) < 0)
goto nla_put_failure;
if (ctnetlink_dump_status(skb, ct) < 0 ||
@@ -598,7 +621,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
- + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+ + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */
#endif
+ ctnetlink_proto_size(ct)
+ ctnetlink_label_size(ct)
@@ -609,6 +632,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
static int
ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
{
+ const struct nf_conntrack_zone *zone;
struct net *net;
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
@@ -655,11 +679,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
nfmsg->res_id = 0;
rcu_read_lock();
+ zone = nf_ct_zone(ct);
+
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_ORIG) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
@@ -667,10 +696,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_REPL) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- if (nf_ct_zone(ct) &&
- nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
+ NF_CT_DEFAULT_ZONE_DIR) < 0)
goto nla_put_failure;
if (ctnetlink_dump_id(skb, ct) < 0)
@@ -920,15 +952,54 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr,
return ret;
}
+static int
+ctnetlink_parse_zone(const struct nlattr *attr,
+ struct nf_conntrack_zone *zone)
+{
+ nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID,
+ NF_CT_DEFAULT_ZONE_DIR, 0);
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ if (attr)
+ zone->id = ntohs(nla_get_be16(attr));
+#else
+ if (attr)
+ return -EOPNOTSUPP;
+#endif
+ return 0;
+}
+
+static int
+ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type,
+ struct nf_conntrack_zone *zone)
+{
+ int ret;
+
+ if (zone->id != NF_CT_DEFAULT_ZONE_ID)
+ return -EINVAL;
+
+ ret = ctnetlink_parse_zone(attr, zone);
+ if (ret < 0)
+ return ret;
+
+ if (type == CTA_TUPLE_REPLY)
+ zone->dir = NF_CT_ZONE_DIR_REPL;
+ else
+ zone->dir = NF_CT_ZONE_DIR_ORIG;
+
+ return 0;
+}
+
static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
[CTA_TUPLE_IP] = { .type = NLA_NESTED },
[CTA_TUPLE_PROTO] = { .type = NLA_NESTED },
+ [CTA_TUPLE_ZONE] = { .type = NLA_U16 },
};
static int
ctnetlink_parse_tuple(const struct nlattr * const cda[],
struct nf_conntrack_tuple *tuple,
- enum ctattr_type type, u_int8_t l3num)
+ enum ctattr_type type, u_int8_t l3num,
+ struct nf_conntrack_zone *zone)
{
struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
@@ -955,6 +1026,16 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
if (err < 0)
return err;
+ if (tb[CTA_TUPLE_ZONE]) {
+ if (!zone)
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE],
+ type, zone);
+ if (err < 0)
+ return err;
+ }
+
/* orig and expect tuples get DIR_ORIGINAL */
if (type == CTA_TUPLE_REPLY)
tuple->dst.dir = IP_CT_DIR_REPLY;
@@ -964,21 +1045,6 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
return 0;
}
-static int
-ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone)
-{
- if (attr)
-#ifdef CONFIG_NF_CONNTRACK_ZONES
- *zone = ntohs(nla_get_be16(attr));
-#else
- return -EOPNOTSUPP;
-#endif
- else
- *zone = 0;
-
- return 0;
-}
-
static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
[CTA_HELP_NAME] = { .type = NLA_NUL_STRING,
.len = NF_CT_HELPER_NAME_LEN - 1 },
@@ -1058,7 +1124,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
struct nf_conn *ct;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- u16 zone;
+ struct nf_conntrack_zone zone;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1066,9 +1132,11 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
return err;
if (cda[CTA_TUPLE_ORIG])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
+ u3, &zone);
else if (cda[CTA_TUPLE_REPLY])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
+ u3, &zone);
else {
return ctnetlink_flush_conntrack(net, cda,
NETLINK_CB(skb).portid,
@@ -1078,7 +1146,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
if (err < 0)
return err;
- h = nf_conntrack_find_get(net, zone, &tuple);
+ h = nf_conntrack_find_get(net, &zone, &tuple);
if (!h)
return -ENOENT;
@@ -1112,7 +1180,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
struct sk_buff *skb2 = NULL;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- u16 zone;
+ struct nf_conntrack_zone zone;
int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -1138,16 +1206,18 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
return err;
if (cda[CTA_TUPLE_ORIG])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
+ u3, &zone);
else if (cda[CTA_TUPLE_REPLY])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
+ u3, &zone);
else
return -EINVAL;
if (err < 0)
return err;
- h = nf_conntrack_find_get(net, zone, &tuple);
+ h = nf_conntrack_find_get(net, &zone, &tuple);
if (!h)
return -ENOENT;
@@ -1645,7 +1715,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
}
static struct nf_conn *
-ctnetlink_create_conntrack(struct net *net, u16 zone,
+ctnetlink_create_conntrack(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nlattr * const cda[],
struct nf_conntrack_tuple *otuple,
struct nf_conntrack_tuple *rtuple,
@@ -1761,7 +1832,8 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
struct nf_conntrack_tuple_hash *master_h;
struct nf_conn *master_ct;
- err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3);
+ err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER,
+ u3, NULL);
if (err < 0)
goto err2;
@@ -1804,7 +1876,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nf_conn *ct;
u_int8_t u3 = nfmsg->nfgen_family;
- u16 zone;
+ struct nf_conntrack_zone zone;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1812,21 +1884,23 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
return err;
if (cda[CTA_TUPLE_ORIG]) {
- err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3);
+ err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG,
+ u3, &zone);
if (err < 0)
return err;
}
if (cda[CTA_TUPLE_REPLY]) {
- err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3);
+ err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY,
+ u3, &zone);
if (err < 0)
return err;
}
if (cda[CTA_TUPLE_ORIG])
- h = nf_conntrack_find_get(net, zone, &otuple);
+ h = nf_conntrack_find_get(net, &zone, &otuple);
else if (cda[CTA_TUPLE_REPLY])
- h = nf_conntrack_find_get(net, zone, &rtuple);
+ h = nf_conntrack_find_get(net, &zone, &rtuple);
if (h == NULL) {
err = -ENOENT;
@@ -1836,7 +1910,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
return -EINVAL;
- ct = ctnetlink_create_conntrack(net, zone, cda, &otuple,
+ ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple,
&rtuple, u3);
if (IS_ERR(ct))
return PTR_ERR(ct);
@@ -2059,9 +2133,9 @@ ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct,
struct nf_conntrack_tuple *tuple,
struct nf_conntrack_tuple *mask);
-#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
static size_t
-ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
+ctnetlink_glue_build_size(const struct nf_conn *ct)
{
return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
+ 3 * nla_total_size(0) /* CTA_TUPLE_IP */
@@ -2082,23 +2156,40 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
- + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+ + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */
#endif
+ ctnetlink_proto_size(ct)
;
}
-static int
-ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
+static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, ctinfo);
+ if (ct && nf_ct_is_untracked(ct))
+ ct = NULL;
+
+ return ct;
+}
+
+static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
{
+ const struct nf_conntrack_zone *zone;
struct nlattr *nest_parms;
rcu_read_lock();
+ zone = nf_ct_zone(ct);
+
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_ORIG) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
@@ -2106,12 +2197,14 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
goto nla_put_failure;
+ if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
+ NF_CT_ZONE_DIR_REPL) < 0)
+ goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- if (nf_ct_zone(ct)) {
- if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
- goto nla_put_failure;
- }
+ if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
+ NF_CT_DEFAULT_ZONE_DIR) < 0)
+ goto nla_put_failure;
if (ctnetlink_dump_id(skb, ct) < 0)
goto nla_put_failure;
@@ -2154,7 +2247,32 @@ nla_put_failure:
}
static int
-ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
+ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ u_int16_t ct_attr, u_int16_t ct_info_attr)
+{
+ struct nlattr *nest_parms;
+
+ nest_parms = nla_nest_start(skb, ct_attr | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (__ctnetlink_glue_build(skb, ct) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+
+ if (nla_put_be32(skb, ct_info_attr, htonl(ctinfo)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static int
+ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
{
int err;
@@ -2194,7 +2312,7 @@ ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
}
static int
-ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
+ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct)
{
struct nlattr *cda[CTA_MAX+1];
int ret;
@@ -2204,31 +2322,31 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
return ret;
spin_lock_bh(&nf_conntrack_expect_lock);
- ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct);
+ ret = ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct);
spin_unlock_bh(&nf_conntrack_expect_lock);
return ret;
}
-static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda,
- const struct nf_conn *ct,
- struct nf_conntrack_tuple *tuple,
- struct nf_conntrack_tuple *mask)
+static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda,
+ const struct nf_conn *ct,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask)
{
int err;
err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE,
- nf_ct_l3num(ct));
+ nf_ct_l3num(ct), NULL);
if (err < 0)
return err;
return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK,
- nf_ct_l3num(ct));
+ nf_ct_l3num(ct), NULL);
}
static int
-ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
- u32 portid, u32 report)
+ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
+ u32 portid, u32 report)
{
struct nlattr *cda[CTA_EXPECT_MAX+1];
struct nf_conntrack_tuple tuple, mask;
@@ -2240,8 +2358,8 @@ ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
if (err < 0)
return err;
- err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda,
- ct, &tuple, &mask);
+ err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda,
+ ct, &tuple, &mask);
if (err < 0)
return err;
@@ -2268,14 +2386,24 @@ ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
return 0;
}
-static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
- .build_size = ctnetlink_nfqueue_build_size,
- .build = ctnetlink_nfqueue_build,
- .parse = ctnetlink_nfqueue_parse,
- .attach_expect = ctnetlink_nfqueue_attach_expect,
- .seq_adjust = nf_ct_tcp_seqadj_set,
+static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, int diff)
+{
+ if (!(ct->status & IPS_NAT_MASK))
+ return;
+
+ nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff);
+}
+
+static struct nfnl_ct_hook ctnetlink_glue_hook = {
+ .get_ct = ctnetlink_glue_get_ct,
+ .build_size = ctnetlink_glue_build_size,
+ .build = ctnetlink_glue_build,
+ .parse = ctnetlink_glue_parse,
+ .attach_expect = ctnetlink_glue_attach_expect,
+ .seq_adjust = ctnetlink_glue_seqadj,
};
-#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
+#endif /* CONFIG_NETFILTER_NETLINK_GLUE_CT */
/***********************************************************************
* EXPECT
@@ -2612,23 +2740,22 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
- u16 zone = 0;
+ struct nf_conntrack_zone zone;
struct netlink_dump_control c = {
.dump = ctnetlink_exp_ct_dump_table,
.done = ctnetlink_exp_done,
};
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
+ u3, NULL);
if (err < 0)
return err;
- if (cda[CTA_EXPECT_ZONE]) {
- err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
- if (err < 0)
- return err;
- }
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
- h = nf_conntrack_find_get(net, zone, &tuple);
+ h = nf_conntrack_find_get(net, &zone, &tuple);
if (!h)
return -ENOENT;
@@ -2652,7 +2779,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
struct sk_buff *skb2;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- u16 zone;
+ struct nf_conntrack_zone zone;
int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -2672,16 +2799,18 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
return err;
if (cda[CTA_EXPECT_TUPLE])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+ u3, NULL);
else if (cda[CTA_EXPECT_MASTER])
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
+ u3, NULL);
else
return -EINVAL;
if (err < 0)
return err;
- exp = nf_ct_expect_find_get(net, zone, &tuple);
+ exp = nf_ct_expect_find_get(net, &zone, &tuple);
if (!exp)
return -ENOENT;
@@ -2732,8 +2861,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct hlist_node *next;
u_int8_t u3 = nfmsg->nfgen_family;
+ struct nf_conntrack_zone zone;
unsigned int i;
- u16 zone;
int err;
if (cda[CTA_EXPECT_TUPLE]) {
@@ -2742,12 +2871,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
if (err < 0)
return err;
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+ u3, NULL);
if (err < 0)
return err;
/* bump usage count to 2 */
- exp = nf_ct_expect_find_get(net, zone, &tuple);
+ exp = nf_ct_expect_find_get(net, &zone, &tuple);
if (!exp)
return -ENOENT;
@@ -2849,7 +2979,8 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr,
return -EINVAL;
err = ctnetlink_parse_tuple((const struct nlattr * const *)tb,
- &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3);
+ &nat_tuple, CTA_EXPECT_NAT_TUPLE,
+ u3, NULL);
if (err < 0)
return err;
@@ -2937,7 +3068,8 @@ err_out:
}
static int
-ctnetlink_create_expect(struct net *net, u16 zone,
+ctnetlink_create_expect(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nlattr * const cda[],
u_int8_t u3, u32 portid, int report)
{
@@ -2949,13 +3081,16 @@ ctnetlink_create_expect(struct net *net, u16 zone,
int err;
/* caller guarantees that those three CTA_EXPECT_* exist */
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+ u3, NULL);
if (err < 0)
return err;
- err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
+ err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK,
+ u3, NULL);
if (err < 0)
return err;
- err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
+ err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER,
+ u3, NULL);
if (err < 0)
return err;
@@ -2995,11 +3130,6 @@ ctnetlink_create_expect(struct net *net, u16 zone,
}
err = nf_ct_expect_related_report(exp, portid, report);
- if (err < 0)
- goto err_exp;
-
- return 0;
-err_exp:
nf_ct_expect_put(exp);
err_ct:
nf_ct_put(ct);
@@ -3016,7 +3146,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
struct nf_conntrack_expect *exp;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
- u16 zone;
+ struct nf_conntrack_zone zone;
int err;
if (!cda[CTA_EXPECT_TUPLE]
@@ -3028,19 +3158,18 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
if (err < 0)
return err;
- err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
+ u3, NULL);
if (err < 0)
return err;
spin_lock_bh(&nf_conntrack_expect_lock);
- exp = __nf_ct_expect_find(net, zone, &tuple);
-
+ exp = __nf_ct_expect_find(net, &zone, &tuple);
if (!exp) {
spin_unlock_bh(&nf_conntrack_expect_lock);
err = -ENOENT;
if (nlh->nlmsg_flags & NLM_F_CREATE) {
- err = ctnetlink_create_expect(net, zone, cda,
- u3,
+ err = ctnetlink_create_expect(net, &zone, cda, u3,
NETLINK_CB(skb).portid,
nlmsg_report(nlh));
}
@@ -3258,9 +3387,9 @@ static int __init ctnetlink_init(void)
pr_err("ctnetlink_init: cannot register pernet operations\n");
goto err_unreg_exp_subsys;
}
-#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
/* setup interaction between nf_queue and nf_conntrack_netlink. */
- RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook);
+ RCU_INIT_POINTER(nfnl_ct_hook, &ctnetlink_glue_hook);
#endif
return 0;
@@ -3279,8 +3408,8 @@ static void __exit ctnetlink_exit(void)
unregister_pernet_subsys(&ctnetlink_net_ops);
nfnetlink_subsys_unregister(&ctnl_exp_subsys);
nfnetlink_subsys_unregister(&ctnl_subsys);
-#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
- RCU_INIT_POINTER(nfq_ct_hook, NULL);
+#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
+ RCU_INIT_POINTER(nfnl_ct_hook, NULL);
#endif
}
diff --git a/kernel/net/netfilter/nf_conntrack_pptp.c b/kernel/net/netfilter/nf_conntrack_pptp.c
index 825c3e3f8..5588c7ae1 100644
--- a/kernel/net/netfilter/nf_conntrack_pptp.c
+++ b/kernel/net/netfilter/nf_conntrack_pptp.c
@@ -143,13 +143,14 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
const struct nf_conntrack_tuple *t)
{
const struct nf_conntrack_tuple_hash *h;
+ const struct nf_conntrack_zone *zone;
struct nf_conntrack_expect *exp;
struct nf_conn *sibling;
- u16 zone = nf_ct_zone(ct);
pr_debug("trying to timeout ct or exp for tuple ");
nf_ct_dump_tuple(t);
+ zone = nf_ct_zone(ct);
h = nf_conntrack_find_get(net, zone, t);
if (h) {
sibling = nf_ct_tuplehash_to_ctrack(h);
diff --git a/kernel/net/netfilter/nf_conntrack_proto_dccp.c b/kernel/net/netfilter/nf_conntrack_proto_dccp.c
index 6dd995c7c..fce1b1cca 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_dccp.c
@@ -398,7 +398,7 @@ static inline struct dccp_net *dccp_pernet(struct net *net)
}
static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
struct dccp_hdr _hdr, *dh;
diff --git a/kernel/net/netfilter/nf_conntrack_proto_generic.c b/kernel/net/netfilter/nf_conntrack_proto_generic.c
index 60865f110..86dc752e5 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_generic.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_generic.c
@@ -45,7 +45,7 @@ static inline struct nf_generic_net *generic_pernet(struct net *net)
static bool generic_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
tuple->src.u.all = 0;
tuple->dst.u.all = 0;
@@ -90,7 +90,13 @@ static int generic_packet(struct nf_conn *ct,
static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff, unsigned int *timeouts)
{
- return nf_generic_should_process(nf_ct_protonum(ct));
+ bool ret;
+
+ ret = nf_generic_should_process(nf_ct_protonum(ct));
+ if (!ret)
+ pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n",
+ nf_ct_protonum(ct));
+ return ret;
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
diff --git a/kernel/net/netfilter/nf_conntrack_proto_gre.c b/kernel/net/netfilter/nf_conntrack_proto_gre.c
index 7648674f2..a96451a7a 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_gre.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_gre.c
@@ -190,9 +190,8 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
/* gre hdr info to tuple */
static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
- struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev);
const struct gre_hdr_pptp *pgrehdr;
struct gre_hdr_pptp _pgrehdr;
__be16 srckey;
diff --git a/kernel/net/netfilter/nf_conntrack_proto_sctp.c b/kernel/net/netfilter/nf_conntrack_proto_sctp.c
index b45da90fa..9578a7c37 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_sctp.c
@@ -42,6 +42,8 @@ static const char *const sctp_conntrack_names[] = {
"SHUTDOWN_SENT",
"SHUTDOWN_RECD",
"SHUTDOWN_ACK_SENT",
+ "HEARTBEAT_SENT",
+ "HEARTBEAT_ACKED",
};
#define SECS * HZ
@@ -57,6 +59,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = {
[SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000,
[SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000,
[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
+ [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS,
+ [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS,
};
#define sNO SCTP_CONNTRACK_NONE
@@ -67,6 +71,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = {
#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
+#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT
+#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED
#define sIV SCTP_CONNTRACK_MAX
/*
@@ -88,6 +94,10 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
to that of the SHUTDOWN chunk.
CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
the SHUTDOWN chunk. Connection is closed.
+HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow.
+HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to
+ that of the HEARTBEAT chunk. Secondary connection is
+ established.
*/
/* TODO
@@ -97,36 +107,40 @@ CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
- Check the error type in the reply dir before transitioning from
cookie echoed to closed.
- Sec 5.2.4 of RFC 2960
- - Multi Homing support.
+ - Full Multi Homing support.
*/
/* SCTP conntrack state transitions */
-static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
+static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
-/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
-/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
-/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
-/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
-/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/
-/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
-/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
+/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA},
+/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},
+/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA},
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/
+/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */
+/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA},
+/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
+/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}
},
{
/* REPLY */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
-/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
-/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
-/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
-/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
-/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
-/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
-/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
+/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */
+/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},
+/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL},
+/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA},
+/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA},
+/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */
+/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA},
+/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
+/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA}
}
};
@@ -142,7 +156,7 @@ static inline struct sctp_net *sctp_pernet(struct net *net)
}
static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
const struct sctphdr *hp;
struct sctphdr _hdr;
@@ -278,9 +292,16 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
i = 8;
break;
+ case SCTP_CID_HEARTBEAT:
+ pr_debug("SCTP_CID_HEARTBEAT");
+ i = 9;
+ break;
+ case SCTP_CID_HEARTBEAT_ACK:
+ pr_debug("SCTP_CID_HEARTBEAT_ACK");
+ i = 10;
+ break;
default:
- /* Other chunks like DATA, SACK, HEARTBEAT and
- its ACK do not cause a change in state */
+ /* Other chunks like DATA or SACK do not change the state */
pr_debug("Unknown chunk type, Will stay in %s\n",
sctp_conntrack_names[cur_state]);
return cur_state;
@@ -329,6 +350,8 @@ static int sctp_packet(struct nf_conn *ct,
!test_bit(SCTP_CID_COOKIE_ECHO, map) &&
!test_bit(SCTP_CID_ABORT, map) &&
!test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
+ !test_bit(SCTP_CID_HEARTBEAT, map) &&
+ !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
sh->vtag != ct->proto.sctp.vtag[dir]) {
pr_debug("Verification tag check failed\n");
goto out;
@@ -357,6 +380,16 @@ static int sctp_packet(struct nf_conn *ct,
/* Sec 8.5.1 (D) */
if (sh->vtag != ct->proto.sctp.vtag[dir])
goto out_unlock;
+ } else if (sch->type == SCTP_CID_HEARTBEAT ||
+ sch->type == SCTP_CID_HEARTBEAT_ACK) {
+ if (ct->proto.sctp.vtag[dir] == 0) {
+ pr_debug("Setting vtag %x for dir %d\n",
+ sh->vtag, dir);
+ ct->proto.sctp.vtag[dir] = sh->vtag;
+ } else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
+ pr_debug("Verification tag check failed\n");
+ goto out_unlock;
+ }
}
old_state = ct->proto.sctp.state;
@@ -466,6 +499,10 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* Sec 8.5.1 (A) */
return false;
}
+ } else if (sch->type == SCTP_CID_HEARTBEAT) {
+ pr_debug("Setting vtag %x for secondary conntrack\n",
+ sh->vtag);
+ ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
}
/* If it is a shutdown ack OOTB packet, we expect a return
shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
@@ -610,6 +647,8 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
[CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 },
};
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
@@ -658,6 +697,18 @@ static struct ctl_table sctp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "nf_conntrack_sctp_timeout_heartbeat_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_sctp_timeout_heartbeat_acked",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
{ }
};
@@ -730,6 +781,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
+ pn->ctl_table[7].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_SENT];
+ pn->ctl_table[8].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_ACKED];
#endif
return 0;
}
diff --git a/kernel/net/netfilter/nf_conntrack_proto_tcp.c b/kernel/net/netfilter/nf_conntrack_proto_tcp.c
index 70383de72..278f3b935 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_tcp.c
@@ -277,7 +277,7 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net)
}
static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
const struct tcphdr *hp;
struct tcphdr _hdr;
diff --git a/kernel/net/netfilter/nf_conntrack_proto_udp.c b/kernel/net/netfilter/nf_conntrack_proto_udp.c
index 6957281ff..478f92f83 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_udp.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_udp.c
@@ -38,6 +38,7 @@ static inline struct nf_udp_net *udp_pernet(struct net *net)
static bool udp_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
+ struct net *net,
struct nf_conntrack_tuple *tuple)
{
const struct udphdr *hp;
diff --git a/kernel/net/netfilter/nf_conntrack_proto_udplite.c b/kernel/net/netfilter/nf_conntrack_proto_udplite.c
index c5903d164..1ac8ee13a 100644
--- a/kernel/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/kernel/net/netfilter/nf_conntrack_proto_udplite.c
@@ -48,6 +48,7 @@ static inline struct udplite_net *udplite_pernet(struct net *net)
static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
+ struct net *net,
struct nf_conntrack_tuple *tuple)
{
const struct udphdr *hp;
diff --git a/kernel/net/netfilter/nf_conntrack_seqadj.c b/kernel/net/netfilter/nf_conntrack_seqadj.c
index ce3e840c8..dff0f0cc5 100644
--- a/kernel/net/netfilter/nf_conntrack_seqadj.c
+++ b/kernel/net/netfilter/nf_conntrack_seqadj.c
@@ -103,9 +103,9 @@ static void nf_ct_sack_block_adjust(struct sk_buff *skb,
ntohl(sack->end_seq), ntohl(new_end_seq));
inet_proto_csum_replace4(&tcph->check, skb,
- sack->start_seq, new_start_seq, 0);
+ sack->start_seq, new_start_seq, false);
inet_proto_csum_replace4(&tcph->check, skb,
- sack->end_seq, new_end_seq, 0);
+ sack->end_seq, new_end_seq, false);
sack->start_seq = new_start_seq;
sack->end_seq = new_end_seq;
sackoff += sizeof(*sack);
@@ -193,8 +193,9 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
newseq = htonl(ntohl(tcph->seq) + seqoff);
newack = htonl(ntohl(tcph->ack_seq) - ackoff);
- inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
- inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack,
+ false);
pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
diff --git a/kernel/net/netfilter/nf_conntrack_standalone.c b/kernel/net/netfilter/nf_conntrack_standalone.c
index fc823fa5d..1fb3cacc0 100644
--- a/kernel/net/netfilter/nf_conntrack_standalone.c
+++ b/kernel/net/netfilter/nf_conntrack_standalone.c
@@ -140,6 +140,35 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
}
#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct,
+ int dir)
+{
+ const struct nf_conntrack_zone *zone = nf_ct_zone(ct);
+
+ if (zone->dir != dir)
+ return;
+ switch (zone->dir) {
+ case NF_CT_DEFAULT_ZONE_DIR:
+ seq_printf(s, "zone=%u ", zone->id);
+ break;
+ case NF_CT_ZONE_DIR_ORIG:
+ seq_printf(s, "zone-orig=%u ", zone->id);
+ break;
+ case NF_CT_ZONE_DIR_REPL:
+ seq_printf(s, "zone-reply=%u ", zone->id);
+ break;
+ default:
+ break;
+ }
+}
+#else
+static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct,
+ int dir)
+{
+}
+#endif
+
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
{
@@ -202,6 +231,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
l3proto, l4proto);
+ ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG);
+
if (seq_has_overflowed(s))
goto release;
@@ -214,6 +245,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
l3proto, l4proto);
+ ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL);
+
if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
goto release;
@@ -228,11 +261,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
#endif
ct_show_secctx(s, ct);
-
-#ifdef CONFIG_NF_CONNTRACK_ZONES
- seq_printf(s, "zone=%u ", nf_ct_zone(ct));
-#endif
-
+ ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR);
ct_show_delta_time(s, ct);
seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
diff --git a/kernel/net/netfilter/nf_internals.h b/kernel/net/netfilter/nf_internals.h
index ea7f36784..065522564 100644
--- a/kernel/net/netfilter/nf_internals.h
+++ b/kernel/net/netfilter/nf_internals.h
@@ -19,6 +19,7 @@ unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
/* nf_queue.c */
int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem,
struct nf_hook_state *state, unsigned int queuenum);
+void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops);
int __init netfilter_queue_init(void);
/* nf_log.c */
diff --git a/kernel/net/netfilter/nf_log.c b/kernel/net/netfilter/nf_log.c
index 675d12c69..a5d41dfa9 100644
--- a/kernel/net/netfilter/nf_log.c
+++ b/kernel/net/netfilter/nf_log.c
@@ -107,12 +107,17 @@ EXPORT_SYMBOL(nf_log_register);
void nf_log_unregister(struct nf_logger *logger)
{
+ const struct nf_logger *log;
int i;
mutex_lock(&nf_log_mutex);
- for (i = 0; i < NFPROTO_NUMPROTO; i++)
- RCU_INIT_POINTER(loggers[i][logger->type], NULL);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+ log = nft_log_dereference(loggers[i][logger->type]);
+ if (log == logger)
+ RCU_INIT_POINTER(loggers[i][logger->type], NULL);
+ }
mutex_unlock(&nf_log_mutex);
+ synchronize_rcu();
}
EXPORT_SYMBOL(nf_log_unregister);
diff --git a/kernel/net/netfilter/nf_nat_core.c b/kernel/net/netfilter/nf_nat_core.c
index 4e0b47831..06a9f4577 100644
--- a/kernel/net/netfilter/nf_nat_core.c
+++ b/kernel/net/netfilter/nf_nat_core.c
@@ -83,7 +83,7 @@ out:
rcu_read_unlock();
}
-int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
+int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
{
struct flowi fl;
unsigned int hh_len;
@@ -99,7 +99,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
dst = ((struct xfrm_dst *)dst)->route;
dst_hold(dst);
- dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
+ dst = xfrm_lookup(net, dst, &fl, skb->sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
@@ -118,14 +118,13 @@ EXPORT_SYMBOL(nf_xfrm_me_harder);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
-hash_by_src(const struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
/* Original src, to ensure we map it consistently if poss. */
hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
- tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd);
+ tuple->dst.protonum ^ nf_conntrack_hash_rnd);
return reciprocal_scale(hash, net->ct.nat_htable_size);
}
@@ -185,20 +184,22 @@ same_src(const struct nf_conn *ct,
/* Only called for SRC manip */
static int
-find_appropriate_src(struct net *net, u16 zone,
+find_appropriate_src(struct net *net,
+ const struct nf_conntrack_zone *zone,
const struct nf_nat_l3proto *l3proto,
const struct nf_nat_l4proto *l4proto,
const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_tuple *result,
const struct nf_nat_range *range)
{
- unsigned int h = hash_by_src(net, zone, tuple);
+ unsigned int h = hash_by_src(net, tuple);
const struct nf_conn_nat *nat;
const struct nf_conn *ct;
hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
ct = nat->ct;
- if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
+ if (same_src(ct, tuple) &&
+ nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
/* Copy source part from reply tuple. */
nf_ct_invert_tuplepr(result,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -218,7 +219,8 @@ find_appropriate_src(struct net *net, u16 zone,
* the ip with the lowest src-ip/dst-ip/proto usage.
*/
static void
-find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
+find_best_ips_proto(const struct nf_conntrack_zone *zone,
+ struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
const struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
@@ -258,7 +260,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
*/
j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
range->flags & NF_NAT_RANGE_PERSISTENT ?
- 0 : (__force u32)tuple->dst.u3.all[max] ^ zone);
+ 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);
full_range = false;
for (i = 0; i <= max; i++) {
@@ -297,10 +299,12 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
+ const struct nf_conntrack_zone *zone;
const struct nf_nat_l3proto *l3proto;
const struct nf_nat_l4proto *l4proto;
struct net *net = nf_ct_net(ct);
- u16 zone = nf_ct_zone(ct);
+
+ zone = nf_ct_zone(ct);
rcu_read_lock();
l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
@@ -420,7 +424,7 @@ nf_nat_setup_info(struct nf_conn *ct,
if (maniptype == NF_NAT_MANIP_SRC) {
unsigned int srchash;
- srchash = hash_by_src(net, nf_ct_zone(ct),
+ srchash = hash_by_src(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
spin_lock_bh(&nf_nat_lock);
/* nf_conntrack_alter_reply might re-allocate extension aera */
diff --git a/kernel/net/netfilter/nf_nat_proto_dccp.c b/kernel/net/netfilter/nf_nat_proto_dccp.c
index b8067b53f..15c47b246 100644
--- a/kernel/net/netfilter/nf_nat_proto_dccp.c
+++ b/kernel/net/netfilter/nf_nat_proto_dccp.c
@@ -69,7 +69,7 @@ dccp_manip_pkt(struct sk_buff *skb,
l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum,
tuple, maniptype);
inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
- 0);
+ false);
return true;
}
diff --git a/kernel/net/netfilter/nf_nat_proto_tcp.c b/kernel/net/netfilter/nf_nat_proto_tcp.c
index 37f5505f4..4f8820fc5 100644
--- a/kernel/net/netfilter/nf_nat_proto_tcp.c
+++ b/kernel/net/netfilter/nf_nat_proto_tcp.c
@@ -70,7 +70,7 @@ tcp_manip_pkt(struct sk_buff *skb,
return true;
l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
- inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
+ inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false);
return true;
}
diff --git a/kernel/net/netfilter/nf_nat_proto_udp.c b/kernel/net/netfilter/nf_nat_proto_udp.c
index b0ede2f0d..b1e627227 100644
--- a/kernel/net/netfilter/nf_nat_proto_udp.c
+++ b/kernel/net/netfilter/nf_nat_proto_udp.c
@@ -57,7 +57,7 @@ udp_manip_pkt(struct sk_buff *skb,
l3proto->csum_update(skb, iphdroff, &hdr->check,
tuple, maniptype);
inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
- 0);
+ false);
if (!hdr->check)
hdr->check = CSUM_MANGLED_0;
}
diff --git a/kernel/net/netfilter/nf_nat_proto_udplite.c b/kernel/net/netfilter/nf_nat_proto_udplite.c
index 368f14e01..58340c97b 100644
--- a/kernel/net/netfilter/nf_nat_proto_udplite.c
+++ b/kernel/net/netfilter/nf_nat_proto_udplite.c
@@ -56,7 +56,7 @@ udplite_manip_pkt(struct sk_buff *skb,
}
l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
- inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
+ inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false);
if (!hdr->check)
hdr->check = CSUM_MANGLED_0;
diff --git a/kernel/net/netfilter/nf_nat_redirect.c b/kernel/net/netfilter/nf_nat_redirect.c
index 97b75f9bf..d43869879 100644
--- a/kernel/net/netfilter/nf_nat_redirect.c
+++ b/kernel/net/netfilter/nf_nat_redirect.c
@@ -55,7 +55,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
rcu_read_lock();
indev = __in_dev_get_rcu(skb->dev);
- if (indev != NULL) {
+ if (indev && indev->ifa_list) {
ifa = indev->ifa_list;
newdst = ifa->ifa_local;
}
diff --git a/kernel/net/netfilter/nf_queue.c b/kernel/net/netfilter/nf_queue.c
index 2e88032cd..5baa8e24e 100644
--- a/kernel/net/netfilter/nf_queue.c
+++ b/kernel/net/netfilter/nf_queue.c
@@ -69,19 +69,14 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
dev_put(physdev);
}
#endif
- /* Drop reference to owner of hook which queued us. */
- module_put(entry->elem->owner);
}
EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
/* Bump dev refs so they don't vanish while packet is out */
-bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
- if (!try_module_get(entry->elem->owner))
- return false;
-
if (state->in)
dev_hold(state->in);
if (state->out)
@@ -100,11 +95,20 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
dev_hold(physdev);
}
#endif
-
- return true;
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
+void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
+{
+ const struct nf_queue_handler *qh;
+
+ rcu_read_lock();
+ qh = rcu_dereference(queue_handler);
+ if (qh)
+ qh->nf_hook_drop(net, ops);
+ rcu_read_unlock();
+}
+
/*
* Any packet that leaves via this function must come back
* through nf_reinject().
@@ -120,22 +124,20 @@ int nf_queue(struct sk_buff *skb,
const struct nf_queue_handler *qh;
/* QUEUE == DROP if no one is waiting, to be safe. */
- rcu_read_lock();
-
qh = rcu_dereference(queue_handler);
if (!qh) {
status = -ESRCH;
- goto err_unlock;
+ goto err;
}
afinfo = nf_get_afinfo(state->pf);
if (!afinfo)
- goto err_unlock;
+ goto err;
entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
if (!entry) {
status = -ENOMEM;
- goto err_unlock;
+ goto err;
}
*entry = (struct nf_queue_entry) {
@@ -145,16 +147,11 @@ int nf_queue(struct sk_buff *skb,
.size = sizeof(*entry) + afinfo->route_key_size,
};
- if (!nf_queue_entry_get_refs(entry)) {
- status = -ECANCELED;
- goto err_unlock;
- }
+ nf_queue_entry_get_refs(entry);
skb_dst_force(skb);
afinfo->saveroute(skb, entry);
status = qh->outfn(entry, queuenum);
- rcu_read_unlock();
-
if (status < 0) {
nf_queue_entry_release_refs(entry);
goto err;
@@ -162,8 +159,6 @@ int nf_queue(struct sk_buff *skb,
return 0;
-err_unlock:
- rcu_read_unlock();
err:
kfree(entry);
return status;
@@ -176,19 +171,15 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
const struct nf_afinfo *afinfo;
int err;
- rcu_read_lock();
-
nf_queue_entry_release_refs(entry);
/* Continue traversal iff userspace said ok... */
- if (verdict == NF_REPEAT) {
- elem = list_entry(elem->list.prev, struct nf_hook_ops, list);
- verdict = NF_ACCEPT;
- }
+ if (verdict == NF_REPEAT)
+ verdict = elem->hook(elem->priv, skb, &entry->state);
if (verdict == NF_ACCEPT) {
afinfo = nf_get_afinfo(entry->state.pf);
- if (!afinfo || afinfo->reroute(skb, entry) < 0)
+ if (!afinfo || afinfo->reroute(entry->state.net, skb, entry) < 0)
verdict = NF_DROP;
}
@@ -196,7 +187,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
if (verdict == NF_ACCEPT) {
next_hook:
- verdict = nf_iterate(&nf_hooks[entry->state.pf][entry->state.hook],
+ verdict = nf_iterate(entry->state.hook_list,
skb, &entry->state, &elem);
}
@@ -204,15 +195,13 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
case NF_ACCEPT:
case NF_STOP:
local_bh_disable();
- entry->state.okfn(entry->state.sk, skb);
+ entry->state.okfn(entry->state.net, entry->state.sk, skb);
local_bh_enable();
break;
case NF_QUEUE:
err = nf_queue(skb, elem, &entry->state,
verdict >> NF_VERDICT_QBITS);
if (err < 0) {
- if (err == -ECANCELED)
- goto next_hook;
if (err == -ESRCH &&
(verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
goto next_hook;
@@ -224,7 +213,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
default:
kfree_skb(skb);
}
- rcu_read_unlock();
+
kfree(entry);
}
EXPORT_SYMBOL(nf_reinject);
diff --git a/kernel/net/netfilter/nf_synproxy_core.c b/kernel/net/netfilter/nf_synproxy_core.c
index 52e20c9a4..c8a4a48bc 100644
--- a/kernel/net/netfilter/nf_synproxy_core.c
+++ b/kernel/net/netfilter/nf_synproxy_core.c
@@ -11,15 +11,18 @@
#include <asm/unaligned.h>
#include <net/tcp.h>
#include <net/netns/generic.h>
+#include <linux/proc_fs.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_tcpudp.h>
#include <linux/netfilter/xt_SYNPROXY.h>
+
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_zones.h>
int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
@@ -185,7 +188,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
const struct nf_conn_synproxy *synproxy)
{
unsigned int optoff, optend;
- u32 *ptr, old;
+ __be32 *ptr, old;
if (synproxy->tsoff == 0)
return 1;
@@ -213,18 +216,18 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
if (op[0] == TCPOPT_TIMESTAMP &&
op[1] == TCPOLEN_TIMESTAMP) {
if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
- ptr = (u32 *)&op[2];
+ ptr = (__be32 *)&op[2];
old = *ptr;
*ptr = htonl(ntohl(*ptr) -
synproxy->tsoff);
} else {
- ptr = (u32 *)&op[6];
+ ptr = (__be32 *)&op[6];
old = *ptr;
*ptr = htonl(ntohl(*ptr) +
synproxy->tsoff);
}
inet_proto_csum_replace4(&th->check, skb,
- old, *ptr, 0);
+ old, *ptr, false);
return 1;
}
optoff += op[1];
@@ -348,23 +351,20 @@ static void __net_exit synproxy_proc_exit(struct net *net)
static int __net_init synproxy_net_init(struct net *net)
{
struct synproxy_net *snet = synproxy_pernet(net);
- struct nf_conntrack_tuple t;
struct nf_conn *ct;
int err = -ENOMEM;
- memset(&t, 0, sizeof(t));
- ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL);
- if (IS_ERR(ct)) {
- err = PTR_ERR(ct);
+ ct = nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL);
+ if (!ct)
goto err1;
- }
if (!nfct_seqadj_ext_add(ct))
goto err2;
if (!nfct_synproxy_ext_add(ct))
goto err2;
- nf_conntrack_tmpl_insert(net, ct);
+ __set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ nf_conntrack_get(&ct->ct_general);
snet->tmpl = ct;
snet->stats = alloc_percpu(struct synproxy_stats);
@@ -380,7 +380,7 @@ static int __net_init synproxy_net_init(struct net *net)
err3:
free_percpu(snet->stats);
err2:
- nf_conntrack_free(ct);
+ nf_ct_tmpl_free(ct);
err1:
return err;
}
diff --git a/kernel/net/netfilter/nf_tables_api.c b/kernel/net/netfilter/nf_tables_api.c
index 34ded0931..2cb429d34 100644
--- a/kernel/net/netfilter/nf_tables_api.c
+++ b/kernel/net/netfilter/nf_tables_api.c
@@ -89,6 +89,7 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload)
}
static void nft_ctx_init(struct nft_ctx *ctx,
+ struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
struct nft_af_info *afi,
@@ -96,7 +97,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
struct nft_chain *chain,
const struct nlattr * const *nla)
{
- ctx->net = sock_net(skb->sk);
+ ctx->net = net;
ctx->afi = afi;
ctx->table = table;
ctx->chain = chain;
@@ -127,13 +128,50 @@ static void nft_trans_destroy(struct nft_trans *trans)
kfree(trans);
}
+int nft_register_basechain(struct nft_base_chain *basechain,
+ unsigned int hook_nops)
+{
+ struct net *net = read_pnet(&basechain->pnet);
+
+ if (basechain->flags & NFT_BASECHAIN_DISABLED)
+ return 0;
+
+ return nf_register_net_hooks(net, basechain->ops, hook_nops);
+}
+EXPORT_SYMBOL_GPL(nft_register_basechain);
+
+void nft_unregister_basechain(struct nft_base_chain *basechain,
+ unsigned int hook_nops)
+{
+ struct net *net = read_pnet(&basechain->pnet);
+
+ if (basechain->flags & NFT_BASECHAIN_DISABLED)
+ return;
+
+ nf_unregister_net_hooks(net, basechain->ops, hook_nops);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_basechain);
+
+static int nf_tables_register_hooks(const struct nft_table *table,
+ struct nft_chain *chain,
+ unsigned int hook_nops)
+{
+ if (table->flags & NFT_TABLE_F_DORMANT ||
+ !(chain->flags & NFT_BASE_CHAIN))
+ return 0;
+
+ return nft_register_basechain(nft_base_chain(chain), hook_nops);
+}
+
static void nf_tables_unregister_hooks(const struct nft_table *table,
- const struct nft_chain *chain,
+ struct nft_chain *chain,
unsigned int hook_nops)
{
- if (!(table->flags & NFT_TABLE_F_DORMANT) &&
- chain->flags & NFT_BASE_CHAIN)
- nf_unregister_hooks(nft_base_chain(chain)->ops, hook_nops);
+ if (table->flags & NFT_TABLE_F_DORMANT ||
+ !(chain->flags & NFT_BASE_CHAIN))
+ return;
+
+ nft_unregister_basechain(nft_base_chain(chain), hook_nops);
}
/* Internal table flags */
@@ -560,7 +598,7 @@ static int nf_tables_table_enable(const struct nft_af_info *afi,
if (!(chain->flags & NFT_BASE_CHAIN))
continue;
- err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
+ err = nft_register_basechain(nft_base_chain(chain), afi->nops);
if (err < 0)
goto err;
@@ -575,20 +613,20 @@ err:
if (i-- <= 0)
break;
- nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops);
+ nft_unregister_basechain(nft_base_chain(chain), afi->nops);
}
return err;
}
static void nf_tables_table_disable(const struct nft_af_info *afi,
- struct nft_table *table)
+ struct nft_table *table)
{
struct nft_chain *chain;
list_for_each_entry(chain, &table->chains, list) {
if (chain->flags & NFT_BASE_CHAIN)
- nf_unregister_hooks(nft_base_chain(chain)->ops,
- afi->nops);
+ nft_unregister_basechain(nft_base_chain(chain),
+ afi->nops);
}
}
@@ -635,15 +673,14 @@ err:
return ret;
}
-static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newtable(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nlattr *name;
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
u32 flags = 0;
struct nft_ctx ctx;
@@ -669,7 +706,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
return nf_tables_updtable(&ctx);
}
@@ -679,30 +716,32 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
return -EINVAL;
}
+ err = -EAFNOSUPPORT;
if (!try_module_get(afi->owner))
- return -EAFNOSUPPORT;
+ goto err1;
err = -ENOMEM;
table = kzalloc(sizeof(*table), GFP_KERNEL);
if (table == NULL)
- goto err1;
+ goto err2;
nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN);
INIT_LIST_HEAD(&table->chains);
INIT_LIST_HEAD(&table->sets);
table->flags = flags;
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
if (err < 0)
- goto err2;
+ goto err3;
list_add_tail_rcu(&table->list, &afi->tables);
return 0;
-err2:
+err3:
kfree(table);
-err1:
+err2:
module_put(afi->owner);
+err1:
return err;
}
@@ -771,18 +810,17 @@ out:
return err;
}
-static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_deltable(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
- nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla);
if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL)
return nft_flush(&ctx, family);
@@ -881,6 +919,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
[NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 },
[NFTA_HOOK_PRIORITY] = { .type = NLA_U32 },
+ [NFTA_HOOK_DEV] = { .type = NLA_STRING,
+ .len = IFNAMSIZ - 1 },
};
static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
@@ -954,6 +994,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
goto nla_put_failure;
+ if (basechain->dev_name[0] &&
+ nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name))
+ goto nla_put_failure;
nla_nest_end(skb, nest);
if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
@@ -1165,16 +1208,20 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
BUG_ON(chain->use > 0);
if (chain->flags & NFT_BASE_CHAIN) {
- module_put(nft_base_chain(chain)->type->owner);
- free_percpu(nft_base_chain(chain)->stats);
- kfree(nft_base_chain(chain));
+ struct nft_base_chain *basechain = nft_base_chain(chain);
+
+ module_put(basechain->type->owner);
+ free_percpu(basechain->stats);
+ if (basechain->ops[0].dev != NULL)
+ dev_put(basechain->ops[0].dev);
+ kfree(basechain);
} else {
kfree(chain);
}
}
-static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newchain(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -1184,8 +1231,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
struct nft_chain *chain;
struct nft_base_chain *basechain = NULL;
struct nlattr *ha[NFTA_HOOK_MAX + 1];
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct net_device *dev = NULL;
u8 policy = NF_ACCEPT;
u64 handle = 0;
unsigned int i;
@@ -1264,7 +1311,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
return PTR_ERR(stats);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN,
sizeof(struct nft_trans_chain));
if (trans == NULL) {
@@ -1325,17 +1372,43 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
return -ENOENT;
hookfn = type->hooks[hooknum];
+ if (afi->flags & NFT_AF_NEEDS_DEV) {
+ char ifname[IFNAMSIZ];
+
+ if (!ha[NFTA_HOOK_DEV]) {
+ module_put(type->owner);
+ return -EOPNOTSUPP;
+ }
+
+ nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
+ dev = dev_get_by_name(net, ifname);
+ if (!dev) {
+ module_put(type->owner);
+ return -ENOENT;
+ }
+ } else if (ha[NFTA_HOOK_DEV]) {
+ module_put(type->owner);
+ return -EOPNOTSUPP;
+ }
+
basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
if (basechain == NULL) {
module_put(type->owner);
+ if (dev != NULL)
+ dev_put(dev);
return -ENOMEM;
}
+ if (dev != NULL)
+ strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
+
if (nla[NFTA_CHAIN_COUNTERS]) {
stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
if (IS_ERR(stats)) {
module_put(type->owner);
kfree(basechain);
+ if (dev != NULL)
+ dev_put(dev);
return PTR_ERR(stats);
}
basechain->stats = stats;
@@ -1344,6 +1417,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
if (stats == NULL) {
module_put(type->owner);
kfree(basechain);
+ if (dev != NULL)
+ dev_put(dev);
return -ENOMEM;
}
rcu_assign_pointer(basechain->stats, stats);
@@ -1356,11 +1431,11 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
for (i = 0; i < afi->nops; i++) {
ops = &basechain->ops[i];
ops->pf = family;
- ops->owner = afi->owner;
ops->hooknum = hooknum;
ops->priority = priority;
ops->priv = chain;
ops->hook = afi->hooks[ops->hooknum];
+ ops->dev = dev;
if (hookfn)
ops->hook = hookfn;
if (afi->hook_ops_init)
@@ -1380,14 +1455,11 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
chain->table = table;
nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
- if (!(table->flags & NFT_TABLE_F_DORMANT) &&
- chain->flags & NFT_BASE_CHAIN) {
- err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
- if (err < 0)
- goto err1;
- }
+ err = nf_tables_register_hooks(table, chain, afi->nops);
+ if (err < 0)
+ goto err1;
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN);
if (err < 0)
goto err2;
@@ -1402,15 +1474,14 @@ err1:
return err;
}
-static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delchain(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
struct nft_chain *chain;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
@@ -1432,7 +1503,7 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
if (chain->use > 0)
return -EBUSY;
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
return nft_delchain(&ctx);
}
@@ -1936,13 +2007,12 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
static struct nft_expr_info *info;
-static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newrule(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_chain *chain;
struct nft_rule *rule, *old_rule = NULL;
@@ -2001,7 +2071,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
return PTR_ERR(old_rule);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
n = 0;
size = 0;
@@ -2102,13 +2172,12 @@ err1:
return err;
}
-static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delrule(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_chain *chain = NULL;
struct nft_rule *rule;
@@ -2131,7 +2200,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
return PTR_ERR(chain);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
if (chain) {
if (nla[NFTA_RULE_HANDLE]) {
@@ -2270,12 +2339,11 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
+static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
- struct net *net = sock_net(skb->sk);
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi = NULL;
struct nft_table *table = NULL;
@@ -2297,7 +2365,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
return -ENOENT;
}
- nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla);
return 0;
}
@@ -2549,6 +2617,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
+ struct net *net = sock_net(skb->sk);
const struct nft_set *set;
struct nft_ctx ctx;
struct sk_buff *skb2;
@@ -2556,7 +2625,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
int err;
/* Verify existence before starting dump */
- err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
@@ -2619,14 +2688,13 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
return 0;
}
-static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newset(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nft_set_ops *ops;
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
@@ -2724,7 +2792,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
if (IS_ERR(table))
return PTR_ERR(table);
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
if (IS_ERR(set)) {
@@ -2808,8 +2876,8 @@ static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set
nft_set_destroy(set);
}
-static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delset(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -2822,7 +2890,7 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
if (nla[NFTA_SET_TABLE] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
@@ -2950,7 +3018,7 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX +
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
+static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -2959,7 +3027,6 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
if (IS_ERR(afi))
@@ -2971,7 +3038,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
if (!trans && (table->flags & NFT_TABLE_INACTIVE))
return -ENOENT;
- nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla);
return 0;
}
@@ -3061,6 +3128,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
const struct nft_set *set;
struct nft_set_dump_args args;
struct nft_ctx ctx;
@@ -3076,8 +3144,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
if (err < 0)
return err;
- err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla,
- false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh,
+ (void *)nla, false);
if (err < 0)
return err;
@@ -3138,11 +3206,12 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
+ struct net *net = sock_net(skb->sk);
const struct nft_set *set;
struct nft_ctx ctx;
int err;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false);
if (err < 0)
return err;
@@ -3454,11 +3523,10 @@ err1:
return err;
}
-static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
- struct net *net = sock_net(skb->sk);
const struct nlattr *attr;
struct nft_set *set;
struct nft_ctx ctx;
@@ -3467,7 +3535,7 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, true);
if (err < 0)
return err;
@@ -3549,8 +3617,8 @@ err1:
return err;
}
-static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nlattr *attr;
@@ -3561,7 +3629,7 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false);
if (err < 0)
return err;
@@ -3956,7 +4024,8 @@ static int nf_tables_abort(struct sk_buff *skb)
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list,
+ list) {
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
diff --git a/kernel/net/netfilter/nf_tables_core.c b/kernel/net/netfilter/nf_tables_core.c
index f153b0707..f3695a497 100644
--- a/kernel/net/netfilter/nf_tables_core.c
+++ b/kernel/net/netfilter/nf_tables_core.c
@@ -48,9 +48,7 @@ static void __nft_trace_packet(const struct nft_pktinfo *pkt,
const struct nft_chain *chain,
int rulenum, enum nft_trace type)
{
- struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
-
- nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in,
+ nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in,
pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
chain->table->name, chain->name, comments[type],
rulenum);
@@ -111,10 +109,10 @@ struct nft_jumpstack {
};
unsigned int
-nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
+nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
- const struct nft_chain *chain = ops->priv, *basechain = chain;
- const struct net *net = read_pnet(&nft_base_chain(basechain)->pnet);
+ const struct nft_chain *chain = priv, *basechain = chain;
+ const struct net *net = pkt->net;
const struct nft_rule *rule;
const struct nft_expr *expr, *last;
struct nft_regs regs;
diff --git a/kernel/net/netfilter/nf_tables_netdev.c b/kernel/net/netfilter/nf_tables_netdev.c
new file mode 100644
index 000000000..edb3502f2
--- /dev/null
+++ b/kernel/net/netfilter/nf_tables_netdev.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <net/netfilter/nf_tables.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+
+/*
+ * Initialise @pkt for an IPv4 frame seen on the netdev ingress hook.
+ *
+ * nft_set_pktinfo() is always called.  The transport fields (tprot,
+ * xt.thoff, xt.fragoff) are only filled in when the IPv4 header can be
+ * read and passes the sanity checks below; otherwise the function returns
+ * early and leaves them as nft_set_pktinfo() set them.
+ */
+static inline void
+nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
+			    struct sk_buff *skb,
+			    const struct nf_hook_state *state)
+{
+	struct iphdr *iph, _iph;
+	u32 len, thoff;
+
+	nft_set_pktinfo(pkt, skb, state);
+
+	/*
+	 * Keep using the pointer returned by skb_header_pointer(): it is the
+	 * only one that is known to cover sizeof(*iph) readable bytes (it may
+	 * point at the on-stack copy in _iph).  Re-deriving the pointer with
+	 * ip_hdr() would discard that guarantee.
+	 */
+	iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph),
+				 &_iph);
+	if (!iph)
+		return;
+
+	if (iph->ihl < 5 || iph->version != 4)
+		return;
+
+	len = ntohs(iph->tot_len);
+	thoff = iph->ihl * 4;
+	/* Reject a total length larger than the skb or shorter than the header. */
+	if (skb->len < len)
+		return;
+	else if (len < thoff)
+		return;
+
+	pkt->tprot = iph->protocol;
+	pkt->xt.thoff = thoff;
+	pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET;
+}
+/*
+ * Fill the transport fields of @pkt (tprot, xt.thoff, xt.fragoff) from the
+ * IPv6 header of @skb.  The body is compiled only when CONFIG_IPV6 is
+ * enabled; otherwise this is an empty function.  Every failed check returns
+ * early without touching @pkt.  @state is not used here.
+ */
+static inline void
+__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct ipv6hdr *ip6h, _ip6h;
+ unsigned int thoff = 0;
+ unsigned short frag_off;
+ int protohdr;
+ u32 pkt_len;
+
+ ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h),
+ &_ip6h);
+ if (!ip6h)
+ return;
+
+ if (ip6h->version != 6)
+ return;
+ /* payload_len plus the fixed header must fit inside the skb */
+ pkt_len = ntohs(ip6h->payload_len);
+ if (pkt_len + sizeof(*ip6h) > skb->len)
+ return;
+ /* NOTE(review): reads pkt->skb, presumably set to @skb by the caller's nft_set_pktinfo() - confirm */
+ protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
+ if (protohdr < 0)
+ return;
+
+ pkt->tprot = protohdr;
+ pkt->xt.thoff = thoff;
+ pkt->xt.fragoff = frag_off;
+#endif
+}
+/*
+ * Initialise @pkt for an IPv6 frame: run the generic nft_set_pktinfo()
+ * first, then let __nft_netdev_set_pktinfo_ipv6() add the transport fields
+ * when the header can be parsed.
+ */
+static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ nft_set_pktinfo(pkt, skb, state);
+ __nft_netdev_set_pktinfo_ipv6(pkt, skb, state);
+}
+/*
+ * Hook function installed for NF_NETDEV_INGRESS (see nft_af_netdev).
+ * Picks the pktinfo initialiser from skb->protocol (IPv4, IPv6, or the
+ * generic one for everything else) and returns the result of
+ * nft_do_chain() for @priv.
+ */
+static unsigned int
+nft_do_chain_netdev(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ nft_netdev_set_pktinfo_ipv4(&pkt, skb, state);
+ break;
+ case htons(ETH_P_IPV6):
+ nft_netdev_set_pktinfo_ipv6(&pkt, skb, state);
+ break;
+ default:
+ nft_set_pktinfo(&pkt, skb, state);
+ break;
+ }
+
+ return nft_do_chain(&pkt, priv);
+}
+/*
+ * Template address-family descriptor for NFPROTO_NETDEV.  It is copied
+ * into a per-namespace allocation by nf_tables_netdev_init_net().  It
+ * carries NFT_AF_NEEDS_DEV and a single hook slot (nops = 1) served by
+ * nft_do_chain_netdev() on NF_NETDEV_INGRESS.
+ */
+static struct nft_af_info nft_af_netdev __read_mostly = {
+ .family = NFPROTO_NETDEV,
+ .nhooks = NF_NETDEV_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .flags = NFT_AF_NEEDS_DEV,
+ .nops = 1,
+ .hooks = {
+ [NF_NETDEV_INGRESS] = nft_do_chain_netdev,
+ },
+};
+/*
+ * Per-namespace init: allocate net->nft.netdev, copy the nft_af_netdev
+ * template into it and register it with nft_register_afinfo().
+ * Returns 0 on success.  Both failure paths return -ENOMEM; note that a
+ * registration failure is also reported as -ENOMEM, whatever error
+ * nft_register_afinfo() returned, after freeing the allocation.
+ */
+static int nf_tables_netdev_init_net(struct net *net)
+{
+ net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.netdev == NULL)
+ return -ENOMEM;
+
+ memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev));
+
+ if (nft_register_afinfo(net, net->nft.netdev) < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(net->nft.netdev);
+ return -ENOMEM;
+}
+/*
+ * Per-namespace teardown: unregister the afinfo registered by
+ * nf_tables_netdev_init_net() and free its allocation.
+ */
+static void nf_tables_netdev_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.netdev);
+ kfree(net->nft.netdev);
+}
+/* Per-network-namespace init/exit callbacks for the netdev family. */
+static struct pernet_operations nf_tables_netdev_net_ops = {
+ .init = nf_tables_netdev_init_net,
+ .exit = nf_tables_netdev_exit_net,
+};
+/* "filter" chain type for NFPROTO_NETDEV; only the ingress hook is allowed. */
+static const struct nf_chain_type nft_filter_chain_netdev = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_NETDEV,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_NETDEV_INGRESS),
+};
+
+/*
+ * Apply one netdevice notifier event to a single netdev base chain.
+ *
+ * NETDEV_REGISTER:   if @dev carries the name stored in the chain, take a
+ *                    reference on it, bind it to the chain's hook ops,
+ *                    clear NFT_BASECHAIN_DISABLED and, unless the table is
+ *                    dormant, register the hooks.
+ * NETDEV_UNREGISTER: the reverse: unregister the hooks (unless the table is
+ *                    dormant), drop the device reference, clear the bound
+ *                    device and mark the chain disabled.
+ * NETDEV_CHANGENAME: if @dev is the device bound to the chain, refresh the
+ *                    stored name.
+ *
+ * Other events are ignored.
+ */
+static void nft_netdev_event(unsigned long event, struct nft_af_info *afi,
+			     struct net_device *dev, struct nft_table *table,
+			     struct nft_base_chain *basechain)
+{
+	switch (event) {
+	case NETDEV_REGISTER:
+		if (strcmp(basechain->dev_name, dev->name) != 0)
+			return;
+
+		BUG_ON(!(basechain->flags & NFT_BASECHAIN_DISABLED));
+
+		dev_hold(dev);
+		basechain->ops[0].dev = dev;
+		basechain->flags &= ~NFT_BASECHAIN_DISABLED;
+		if (!(table->flags & NFT_TABLE_F_DORMANT))
+			nft_register_basechain(basechain, afi->nops);
+		break;
+	case NETDEV_UNREGISTER:
+		if (strcmp(basechain->dev_name, dev->name) != 0)
+			return;
+
+		BUG_ON(basechain->flags & NFT_BASECHAIN_DISABLED);
+
+		if (!(table->flags & NFT_TABLE_F_DORMANT))
+			nft_unregister_basechain(basechain, afi->nops);
+
+		dev_put(basechain->ops[0].dev);
+		basechain->ops[0].dev = NULL;
+		basechain->flags |= NFT_BASECHAIN_DISABLED;
+		break;
+	case NETDEV_CHANGENAME:
+		/*
+		 * A chain disabled by NETDEV_UNREGISTER has no bound device
+		 * (ops[0].dev is NULL), so it cannot be the one being renamed;
+		 * skip it instead of dereferencing the NULL pointer.
+		 */
+		if (basechain->ops[0].dev == NULL ||
+		    dev->ifindex != basechain->ops[0].dev->ifindex)
+			return;
+
+		strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
+		break;
+	}
+}
+/*
+ * Netdevice notifier callback.  Under the nfnl mutex of
+ * NFNL_SUBSYS_NFTABLES, walk the address families registered in the
+ * namespace of the affected device, keep only NFPROTO_NETDEV, and hand
+ * every base chain of every table to nft_netdev_event().
+ * Always returns NOTIFY_DONE.
+ */
+static int nf_tables_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_chain *chain;
+
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) {
+ if (afi->family != NFPROTO_NETDEV)
+ continue;
+
+ list_for_each_entry(table, &afi->tables, list) {
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ continue;
+
+ nft_netdev_event(event, afi, dev, table,
+ nft_base_chain(chain));
+ }
+ }
+ }
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+
+ return NOTIFY_DONE;
+}
+/* Notifier block that routes netdevice events to nf_tables_netdev_event(). */
+static struct notifier_block nf_tables_netdev_notifier = {
+ .notifier_call = nf_tables_netdev_event,
+};
+
+/*
+ * Module init: register the netdev "filter" chain type, the per-namespace
+ * operations and the netdevice notifier, in that order.
+ *
+ * Returns 0 on success.  On failure every step that already succeeded is
+ * undone in reverse order and the error is returned, so a failed load never
+ * leaves the notifier (or anything else) registered.
+ */
+static int __init nf_tables_netdev_init(void)
+{
+	int ret;
+
+	ret = nft_register_chain_type(&nft_filter_chain_netdev);
+	if (ret < 0)
+		return ret;
+
+	ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
+	if (ret < 0)
+		goto err_chain_type;
+
+	ret = register_netdevice_notifier(&nf_tables_netdev_notifier);
+	if (ret < 0)
+		goto err_pernet;
+
+	return 0;
+
+err_pernet:
+	unregister_pernet_subsys(&nf_tables_netdev_net_ops);
+err_chain_type:
+	nft_unregister_chain_type(&nft_filter_chain_netdev);
+	return ret;
+}
+/*
+ * Module exit: unregister the netdevice notifier, the per-namespace
+ * operations and the chain type, in that order.
+ */
+static void __exit nf_tables_netdev_exit(void)
+{
+ unregister_netdevice_notifier(&nf_tables_netdev_notifier);
+ unregister_pernet_subsys(&nf_tables_netdev_net_ops);
+ nft_unregister_chain_type(&nft_filter_chain_netdev);
+}
+
+module_init(nf_tables_netdev_init);
+module_exit(nf_tables_netdev_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_FAMILY(5); /* NFPROTO_NETDEV */
diff --git a/kernel/net/netfilter/nfnetlink.c b/kernel/net/netfilter/nfnetlink.c
index 8b117c90e..77afe913d 100644
--- a/kernel/net/netfilter/nfnetlink.c
+++ b/kernel/net/netfilter/nfnetlink.c
@@ -64,7 +64,7 @@ void nfnl_unlock(__u8 subsys_id)
EXPORT_SYMBOL_GPL(nfnl_unlock);
#ifdef CONFIG_PROVE_LOCKING
-int lockdep_nfnl_is_held(u8 subsys_id)
+bool lockdep_nfnl_is_held(u8 subsys_id)
{
return lockdep_is_held(&table[subsys_id].mutex);
}
@@ -269,6 +269,12 @@ static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb)
}
}
+enum {
+ NFNL_BATCH_FAILURE = (1 << 0),
+ NFNL_BATCH_DONE = (1 << 1),
+ NFNL_BATCH_REPLAY = (1 << 2),
+};
+
static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
u_int16_t subsys_id)
{
@@ -276,19 +282,19 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;
- bool success = true, done = false;
static LIST_HEAD(err_list);
+ u32 status;
int err;
if (subsys_id >= NFNL_SUBSYS_COUNT)
return netlink_ack(skb, nlh, -EINVAL);
replay:
+ status = 0;
+
skb = netlink_skb_clone(oskb, GFP_KERNEL);
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM);
- skb->sk = oskb->sk;
-
nfnl_lock(subsys_id);
ss = rcu_dereference_protected(table[subsys_id].subsys,
lockdep_is_held(&table[subsys_id].mutex));
@@ -336,10 +342,10 @@ replay:
if (type == NFNL_MSG_BATCH_BEGIN) {
/* Malformed: Batch begin twice */
nfnl_err_reset(&err_list);
- success = false;
+ status |= NFNL_BATCH_FAILURE;
goto done;
} else if (type == NFNL_MSG_BATCH_END) {
- done = true;
+ status |= NFNL_BATCH_DONE;
goto done;
} else if (type < NLMSG_MIN_TYPE) {
err = -EINVAL;
@@ -373,7 +379,7 @@ replay:
goto ack;
if (nc->call_batch) {
- err = nc->call_batch(net->nfnl, skb, nlh,
+ err = nc->call_batch(net, net->nfnl, skb, nlh,
(const struct nlattr **)cda);
}
@@ -382,11 +388,8 @@ replay:
* original skb.
*/
if (err == -EAGAIN) {
- nfnl_err_reset(&err_list);
- ss->abort(oskb);
- nfnl_unlock(subsys_id);
- kfree_skb(skb);
- goto replay;
+ status |= NFNL_BATCH_REPLAY;
+ goto next;
}
}
ack:
@@ -402,7 +405,7 @@ ack:
*/
nfnl_err_reset(&err_list);
netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM);
- success = false;
+ status |= NFNL_BATCH_FAILURE;
goto done;
}
/* We don't stop processing the batch on errors, thus,
@@ -410,19 +413,26 @@ ack:
* triggers.
*/
if (err)
- success = false;
+ status |= NFNL_BATCH_FAILURE;
}
-
+next:
msglen = NLMSG_ALIGN(nlh->nlmsg_len);
if (msglen > skb->len)
msglen = skb->len;
skb_pull(skb, msglen);
}
done:
- if (success && done)
+ if (status & NFNL_BATCH_REPLAY) {
+ ss->abort(oskb);
+ nfnl_err_reset(&err_list);
+ nfnl_unlock(subsys_id);
+ kfree_skb(skb);
+ goto replay;
+ } else if (status == NFNL_BATCH_DONE) {
ss->commit(oskb);
- else
+ } else {
ss->abort(oskb);
+ }
nfnl_err_deliver(&err_list, oskb);
nfnl_unlock(subsys_id);
@@ -432,6 +442,7 @@ done:
static void nfnetlink_rcv(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ u_int16_t res_id;
int msglen;
if (nlh->nlmsg_len < NLMSG_HDRLEN ||
@@ -456,7 +467,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
nfgenmsg = nlmsg_data(nlh);
skb_pull(skb, msglen);
- nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id);
+ /* Work around old nft using host byte order */
+ if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+ res_id = NFNL_SUBSYS_NFTABLES;
+ else
+ res_id = ntohs(nfgenmsg->res_id);
+ nfnetlink_rcv_batch(skb, nlh, res_id);
} else {
netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
}
@@ -474,7 +490,7 @@ static int nfnetlink_bind(struct net *net, int group)
type = nfnl_group2type[group];
rcu_read_lock();
- ss = nfnetlink_get_subsys(type);
+ ss = nfnetlink_get_subsys(type << 8);
rcu_read_unlock();
if (!ss)
request_module("nfnetlink-subsys-%d", type);
diff --git a/kernel/net/netfilter/nfnetlink_acct.c b/kernel/net/netfilter/nfnetlink_acct.c
index c18af2f63..fefbf5f0b 100644
--- a/kernel/net/netfilter/nfnetlink_acct.c
+++ b/kernel/net/netfilter/nfnetlink_acct.c
@@ -27,8 +27,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure");
-static LIST_HEAD(nfnl_acct_list);
-
struct nf_acct {
atomic64_t pkts;
atomic64_t bytes;
@@ -53,6 +51,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
const struct nlmsghdr *nlh, const struct nlattr * const tb[])
{
struct nf_acct *nfacct, *matching = NULL;
+ struct net *net = sock_net(nfnl);
char *acct_name;
unsigned int size = 0;
u32 flags = 0;
@@ -64,7 +63,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
if (strlen(acct_name) == 0)
return -EINVAL;
- list_for_each_entry(nfacct, &nfnl_acct_list, head) {
+ list_for_each_entry(nfacct, &net->nfnl_acct_list, head) {
if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
@@ -124,7 +123,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
}
atomic_set(&nfacct->refcnt, 1);
- list_add_tail_rcu(&nfacct->head, &nfnl_acct_list);
+ list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list);
return 0;
}
@@ -185,6 +184,7 @@ nla_put_failure:
static int
nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
struct nf_acct *cur, *last;
const struct nfacct_filter *filter = cb->data;
@@ -196,7 +196,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[1] = 0;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
if (last) {
if (cur != last)
continue;
@@ -257,6 +257,7 @@ static int
nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
const struct nlmsghdr *nlh, const struct nlattr * const tb[])
{
+ struct net *net = sock_net(nfnl);
int ret = -ENOENT;
struct nf_acct *cur;
char *acct_name;
@@ -283,7 +284,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
return -EINVAL;
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &nfnl_acct_list, head) {
+ list_for_each_entry(cur, &net->nfnl_acct_list, head) {
struct sk_buff *skb2;
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
@@ -336,19 +337,20 @@ static int
nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb,
const struct nlmsghdr *nlh, const struct nlattr * const tb[])
{
+ struct net *net = sock_net(nfnl);
char *acct_name;
struct nf_acct *cur;
int ret = -ENOENT;
if (!tb[NFACCT_NAME]) {
- list_for_each_entry(cur, &nfnl_acct_list, head)
+ list_for_each_entry(cur, &net->nfnl_acct_list, head)
nfnl_acct_try_del(cur);
return 0;
}
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &nfnl_acct_list, head) {
+ list_for_each_entry(cur, &net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
@@ -394,12 +396,12 @@ static const struct nfnetlink_subsystem nfnl_acct_subsys = {
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
-struct nf_acct *nfnl_acct_find_get(const char *acct_name)
+struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name)
{
struct nf_acct *cur, *acct = NULL;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
continue;
@@ -422,7 +424,9 @@ EXPORT_SYMBOL_GPL(nfnl_acct_find_get);
void nfnl_acct_put(struct nf_acct *acct)
{
- atomic_dec(&acct->refcnt);
+ if (atomic_dec_and_test(&acct->refcnt))
+ kfree_rcu(acct, rcu_head);
+
module_put(THIS_MODULE);
}
EXPORT_SYMBOL_GPL(nfnl_acct_put);
@@ -478,34 +482,59 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
}
EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
+static int __net_init nfnl_acct_net_init(struct net *net)
+{
+ INIT_LIST_HEAD(&net->nfnl_acct_list);
+ /* Per-namespace init only sets up the empty accounting list; it cannot fail. */
+ return 0;
+}
+/*
+ * Per-namespace teardown: unlink every accounting object from the
+ * namespace's list and drop one reference on it.  An object is freed
+ * (via kfree_rcu) only when that drop brings its refcount to zero.
+ */
+static void __net_exit nfnl_acct_net_exit(struct net *net)
+{
+ struct nf_acct *cur, *tmp;
+
+ list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) {
+ list_del_rcu(&cur->head);
+
+ if (atomic_dec_and_test(&cur->refcnt))
+ kfree_rcu(cur, rcu_head);
+ }
+}
+
+static struct pernet_operations nfnl_acct_ops = {
+ .init = nfnl_acct_net_init,
+ .exit = nfnl_acct_net_exit,
+};
+
static int __init nfnl_acct_init(void)
{
int ret;
+ ret = register_pernet_subsys(&nfnl_acct_ops);
+ if (ret < 0) {
+ pr_err("nfnl_acct_init: failed to register pernet ops\n");
+ goto err_out;
+ }
+
pr_info("nfnl_acct: registering with nfnetlink.\n");
ret = nfnetlink_subsys_register(&nfnl_acct_subsys);
if (ret < 0) {
pr_err("nfnl_acct_init: cannot register with nfnetlink.\n");
- goto err_out;
+ goto cleanup_pernet;
}
return 0;
+
+cleanup_pernet:
+ unregister_pernet_subsys(&nfnl_acct_ops);
err_out:
return ret;
}
static void __exit nfnl_acct_exit(void)
{
- struct nf_acct *cur, *tmp;
-
pr_info("nfnl_acct: unregistering from nfnetlink.\n");
nfnetlink_subsys_unregister(&nfnl_acct_subsys);
-
- list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) {
- list_del_rcu(&cur->head);
- /* We are sure that our objects have no clients at this point,
- * it's safe to release them all without checking refcnt. */
- kfree_rcu(cur, rcu_head);
- }
+ unregister_pernet_subsys(&nfnl_acct_ops);
}
module_init(nfnl_acct_init);
diff --git a/kernel/net/netfilter/nfnetlink_cttimeout.c b/kernel/net/netfilter/nfnetlink_cttimeout.c
index 476accd17..c7a2d0e1c 100644
--- a/kernel/net/netfilter/nfnetlink_cttimeout.c
+++ b/kernel/net/netfilter/nfnetlink_cttimeout.c
@@ -291,6 +291,34 @@ cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb,
return ret;
}
+static void untimeout(struct nf_conntrack_tuple_hash *i,
+ struct ctnl_timeout *timeout)
+{
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
+ struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
+ /*
+ * Clear the timeout pointer of this conntrack's timeout extension, if it
+ * has one.  A NULL @timeout matches every extension; otherwise only an
+ * extension that points at @timeout is cleared.
+ */
+ if (timeout_ext && (!timeout || timeout_ext->timeout == timeout))
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+}
+/*
+ * Run untimeout() with @timeout over every entry of the conntrack hash
+ * table of init_net (only init_net's table is walked here).  Bottom halves
+ * are disabled for the whole walk and each bucket is visited while holding
+ * nf_conntrack_locks[i % CONNTRACK_LOCKS].  The table size is re-checked
+ * after taking the lock (NOTE(review): presumably to cope with a concurrent
+ * resize - confirm).
+ */
+static void ctnl_untimeout(struct ctnl_timeout *timeout)
+{
+ struct nf_conntrack_tuple_hash *h;
+ const struct hlist_nulls_node *nn;
+ int i;
+
+ local_bh_disable();
+ for (i = 0; i < init_net.ct.htable_size; i++) {
+ spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ if (i < init_net.ct.htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &init_net.ct.hash[i], hnnode)
+ untimeout(h, timeout);
+ }
+ spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ }
+ local_bh_enable();
+}
+
/* try to delete object, fail if it is still in use. */
static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
{
@@ -301,6 +329,7 @@ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
/* We are protected by nfnl mutex. */
list_del_rcu(&timeout->head);
nf_ct_l4proto_put(timeout->l4proto);
+ ctnl_untimeout(timeout);
kfree_rcu(timeout, rcu_head);
} else {
/* still in use, restore reference counter. */
@@ -567,6 +596,10 @@ static void __exit cttimeout_exit(void)
pr_info("cttimeout: unregistering from nfnetlink.\n");
nfnetlink_subsys_unregister(&cttimeout_subsys);
+
+ /* Make sure no conntrack objects refer to custom timeouts anymore. */
+ ctnl_untimeout(NULL);
+
list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) {
list_del_rcu(&cur->head);
/* We are sure that our objects have no clients at this point,
@@ -579,6 +612,7 @@ static void __exit cttimeout_exit(void)
RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+ rcu_barrier();
}
module_init(cttimeout_init);
diff --git a/kernel/net/netfilter/nfnetlink_log.c b/kernel/net/netfilter/nfnetlink_log.c
index 4ef1fae84..740cce468 100644
--- a/kernel/net/netfilter/nfnetlink_log.c
+++ b/kernel/net/netfilter/nfnetlink_log.c
@@ -27,6 +27,7 @@
#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_log.h>
+#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/spinlock.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
@@ -401,7 +402,9 @@ __build_packet_message(struct nfnl_log_net *log,
unsigned int hooknum,
const struct net_device *indev,
const struct net_device *outdev,
- const char *prefix, unsigned int plen)
+ const char *prefix, unsigned int plen,
+ const struct nfnl_ct_hook *nfnl_ct,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
{
struct nfulnl_msg_packet_hdr pmsg;
struct nlmsghdr *nlh;
@@ -538,9 +541,9 @@ __build_packet_message(struct nfnl_log_net *log,
if (skb->tstamp.tv64) {
struct nfulnl_msg_packet_timestamp ts;
- struct timeval tv = ktime_to_timeval(skb->tstamp);
- ts.sec = cpu_to_be64(tv.tv_sec);
- ts.usec = cpu_to_be64(tv.tv_usec);
+ struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
+ ts.sec = cpu_to_be64(kts.tv_sec);
+ ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts))
goto nla_put_failure;
@@ -575,6 +578,10 @@ __build_packet_message(struct nfnl_log_net *log,
htonl(atomic_inc_return(&log->global_seq))))
goto nla_put_failure;
+ if (ct && nfnl_ct->build(inst->skb, ct, ctinfo,
+ NFULA_CT, NFULA_CT_INFO) < 0)
+ goto nla_put_failure;
+
if (data_len) {
struct nlattr *nla;
int size = nla_attr_size(data_len);
@@ -598,8 +605,6 @@ nla_put_failure:
return -1;
}
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
static struct nf_loginfo default_loginfo = {
.type = NF_LOG_TYPE_ULOG,
.u = {
@@ -622,12 +627,16 @@ nfulnl_log_packet(struct net *net,
const struct nf_loginfo *li_user,
const char *prefix)
{
- unsigned int size, data_len;
+ size_t size;
+ unsigned int data_len;
struct nfulnl_instance *inst;
const struct nf_loginfo *li;
unsigned int qthreshold;
unsigned int plen;
struct nfnl_log_net *log = nfnl_log_pernet(net);
+ const struct nfnl_ct_hook *nfnl_ct = NULL;
+ struct nf_conn *ct = NULL;
+ enum ip_conntrack_info uninitialized_var(ctinfo);
if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
li = li_user;
@@ -673,6 +682,14 @@ nfulnl_log_packet(struct net *net,
size += nla_total_size(sizeof(u_int32_t));
if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
size += nla_total_size(sizeof(u_int32_t));
+ if (inst->flags & NFULNL_CFG_F_CONNTRACK) {
+ nfnl_ct = rcu_dereference(nfnl_ct_hook);
+ if (nfnl_ct != NULL) {
+ ct = nfnl_ct->get_ct(skb, &ctinfo);
+ if (ct != NULL)
+ size += nfnl_ct->build_size(ct);
+ }
+ }
qthreshold = inst->qthreshold;
/* per-rule qthreshold overrides per-instance */
@@ -717,7 +734,8 @@ nfulnl_log_packet(struct net *net,
inst->qlen++;
__build_packet_message(log, inst, skb, data_len, pf,
- hooknum, in, out, prefix, plen);
+ hooknum, in, out, prefix, plen,
+ nfnl_ct, ct, ctinfo);
if (inst->qlen >= qthreshold)
__nfulnl_flush(inst);
@@ -807,6 +825,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
struct net *net = sock_net(ctnl);
struct nfnl_log_net *log = nfnl_log_pernet(net);
int ret = 0;
+ u16 flags = 0;
if (nfula[NFULA_CFG_CMD]) {
u_int8_t pf = nfmsg->nfgen_family;
@@ -828,6 +847,28 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto out_put;
}
+ /* Check if we support these flags in first place, dependencies should
+ * be there too not to break atomicity.
+ */
+ if (nfula[NFULA_CFG_FLAGS]) {
+ flags = ntohs(nla_get_be16(nfula[NFULA_CFG_FLAGS]));
+
+ if ((flags & NFULNL_CFG_F_CONNTRACK) &&
+ !rcu_access_pointer(nfnl_ct_hook)) {
+#ifdef CONFIG_MODULES
+ nfnl_unlock(NFNL_SUBSYS_ULOG);
+ request_module("ip_conntrack_netlink");
+ nfnl_lock(NFNL_SUBSYS_ULOG);
+ if (rcu_access_pointer(nfnl_ct_hook)) {
+ ret = -EAGAIN;
+ goto out_put;
+ }
+#endif
+ ret = -EOPNOTSUPP;
+ goto out_put;
+ }
+ }
+
if (cmd != NULL) {
switch (cmd->command) {
case NFULNL_CFG_CMD_BIND:
@@ -856,16 +897,15 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
ret = -ENOTSUPP;
break;
}
+ } else if (!inst) {
+ ret = -ENODEV;
+ goto out;
}
if (nfula[NFULA_CFG_MODE]) {
- struct nfulnl_msg_config_mode *params;
- params = nla_data(nfula[NFULA_CFG_MODE]);
+ struct nfulnl_msg_config_mode *params =
+ nla_data(nfula[NFULA_CFG_MODE]);
- if (!inst) {
- ret = -ENODEV;
- goto out;
- }
nfulnl_set_mode(inst, params->copy_mode,
ntohl(params->copy_range));
}
@@ -873,42 +913,23 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
if (nfula[NFULA_CFG_TIMEOUT]) {
__be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]);
- if (!inst) {
- ret = -ENODEV;
- goto out;
- }
nfulnl_set_timeout(inst, ntohl(timeout));
}
if (nfula[NFULA_CFG_NLBUFSIZ]) {
__be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]);
- if (!inst) {
- ret = -ENODEV;
- goto out;
- }
nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz));
}
if (nfula[NFULA_CFG_QTHRESH]) {
__be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]);
- if (!inst) {
- ret = -ENODEV;
- goto out;
- }
nfulnl_set_qthresh(inst, ntohl(qthresh));
}
- if (nfula[NFULA_CFG_FLAGS]) {
- __be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]);
-
- if (!inst) {
- ret = -ENODEV;
- goto out;
- }
- nfulnl_set_flags(inst, ntohs(flags));
- }
+ if (nfula[NFULA_CFG_FLAGS])
+ nfulnl_set_flags(inst, flags);
out_put:
instance_put(inst);
diff --git a/kernel/net/netfilter/nfnetlink_queue_core.c b/kernel/net/netfilter/nfnetlink_queue.c
index 11c7682fa..861c66152 100644
--- a/kernel/net/netfilter/nfnetlink_queue_core.c
+++ b/kernel/net/netfilter/nfnetlink_queue.c
@@ -28,12 +28,12 @@
#include <linux/netfilter_bridge.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_queue.h>
+#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/netfilter/nf_queue.h>
#include <net/netns/generic.h>
-#include <net/netfilter/nfnetlink_queue.h>
#include <linux/atomic.h>
@@ -278,13 +278,30 @@ nla_put_failure:
return -1;
}
+static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
+{
+ u32 seclen = 0;
+#if IS_ENABLED(CONFIG_NETWORK_SECMARK)
+ if (!skb || !sk_fullsock(skb->sk))
+ return 0;
+
+ read_lock_bh(&skb->sk->sk_callback_lock);
+
+ if (skb->secmark)
+ security_secid_to_secctx(skb->secmark, secdata, &seclen);
+
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+#endif
+ return seclen;
+}
+
static struct sk_buff *
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct nf_queue_entry *entry,
__be32 **packet_id_ptr)
{
size_t size;
- size_t data_len = 0, cap_len = 0;
+ size_t data_len = 0, cap_len = 0, rem_len = 0;
unsigned int hlen = 0;
struct sk_buff *skb;
struct nlattr *nla;
@@ -296,7 +313,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct net_device *outdev;
struct nf_conn *ct = NULL;
enum ip_conntrack_info uninitialized_var(ctinfo);
+ struct nfnl_ct_hook *nfnl_ct;
bool csum_verify;
+ char *secdata = NULL;
+ u32 seclen = 0;
size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
@@ -341,18 +361,32 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
hlen = min_t(unsigned int, hlen, data_len);
size += sizeof(struct nlattr) + hlen;
cap_len = entskb->len;
+ rem_len = data_len - hlen;
break;
}
- if (queue->flags & NFQA_CFG_F_CONNTRACK)
- ct = nfqnl_ct_get(entskb, &size, &ctinfo);
+ nfnl_ct = rcu_dereference(nfnl_ct_hook);
+
+ if (queue->flags & NFQA_CFG_F_CONNTRACK) {
+ if (nfnl_ct != NULL) {
+ ct = nfnl_ct->get_ct(entskb, &ctinfo);
+ if (ct != NULL)
+ size += nfnl_ct->build_size(ct);
+ }
+ }
if (queue->flags & NFQA_CFG_F_UID_GID) {
size += (nla_total_size(sizeof(u_int32_t)) /* uid */
+ nla_total_size(sizeof(u_int32_t))); /* gid */
}
- skb = nfnetlink_alloc_skb(net, size, queue->peer_portid,
+ if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) {
+ seclen = nfqnl_get_sk_secctx(entskb, &secdata);
+ if (seclen)
+ size += nla_total_size(seclen);
+ }
+
+ skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid,
GFP_ATOMIC);
if (!skb) {
skb_tx_error(entskb);
@@ -467,9 +501,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (entskb->tstamp.tv64) {
struct nfqnl_msg_packet_timestamp ts;
- struct timeval tv = ktime_to_timeval(entskb->tstamp);
- ts.sec = cpu_to_be64(tv.tv_sec);
- ts.usec = cpu_to_be64(tv.tv_usec);
+ struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); /* queued packet's stamp; skb is the netlink message being built */
+
+ ts.sec = cpu_to_be64(kts.tv_sec);
+ ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts))
goto nla_put_failure;
@@ -479,7 +514,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
goto nla_put_failure;
- if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
+ if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata))
+ goto nla_put_failure;
+
+ if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0)
goto nla_put_failure;
if (cap_len > data_len &&
@@ -569,12 +607,9 @@ static struct nf_queue_entry *
nf_queue_entry_dup(struct nf_queue_entry *e)
{
struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
- if (entry) {
- if (nf_queue_entry_get_refs(entry))
- return entry;
- kfree(entry);
- }
- return NULL;
+ if (entry)
+ nf_queue_entry_get_refs(entry);
+ return entry;
}
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
@@ -641,8 +676,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
struct nfqnl_instance *queue;
struct sk_buff *skb, *segs;
int err = -ENOBUFS;
- struct net *net = dev_net(entry->state.in ?
- entry->state.in : entry->state.out);
+ struct net *net = entry->state.net;
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
/* rcu_read_lock()ed by nf_hook_slow() */
@@ -670,7 +704,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
nf_bridge_adjust_skb_data(skb);
segs = skb_gso_segment(skb, 0);
/* Does not use PTR_ERR to limit the number of error codes that can be
- * returned by nf_queue. For instance, callers rely on -ECANCELED to
+ * returned by nf_queue. For instance, callers rely on -ESRCH to
* mean 'ignore this hook'.
*/
if (IS_ERR_OR_NULL(segs))
@@ -806,8 +840,6 @@ nfqnl_dev_drop(struct net *net, int ifindex)
rcu_read_unlock();
}
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
static int
nfqnl_rcv_dev_event(struct notifier_block *this,
unsigned long event, void *ptr)
@@ -824,6 +856,27 @@ static struct notifier_block nfqnl_dev_notifier = {
.notifier_call = nfqnl_rcv_dev_event,
};
+static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr)
+{
+ return entry->elem == (struct nf_hook_ops *)ops_ptr;
+}
+
+static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook)
+{
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < INSTANCE_BUCKETS; i++) {
+ struct nfqnl_instance *inst;
+ struct hlist_head *head = &q->instance_table[i];
+
+ hlist_for_each_entry_rcu(inst, head, hlist)
+ nfqnl_flush(inst, nf_hook_cmp, (unsigned long)hook);
+ }
+ rcu_read_unlock();
+}
+
static int
nfqnl_rcv_nl_event(struct notifier_block *this,
unsigned long event, void *ptr)
@@ -954,6 +1007,28 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
return 0;
}
+static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[],
+ struct nf_queue_entry *entry,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct nf_conn *ct;
+
+ ct = nfnl_ct->get_ct(entry->skb, ctinfo);
+ if (ct == NULL)
+ return NULL;
+
+ if (nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0)
+ return NULL;
+
+ if (nfqa[NFQA_EXP])
+ nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct,
+ NETLINK_CB(entry->skb).portid,
+ nlmsg_report(nlh));
+ return ct;
+}
+
static int
nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
@@ -967,6 +1042,7 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
unsigned int verdict;
struct nf_queue_entry *entry;
enum ip_conntrack_info uninitialized_var(ctinfo);
+ struct nfnl_ct_hook *nfnl_ct;
struct nf_conn *ct = NULL;
struct net *net = sock_net(ctnl);
@@ -989,13 +1065,12 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
if (entry == NULL)
return -ENOENT;
+ /* rcu lock already held from nfnl->call_rcu. */
+ nfnl_ct = rcu_dereference(nfnl_ct_hook);
+
if (nfqa[NFQA_CT]) {
- ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo);
- if (ct && nfqa[NFQA_EXP]) {
- nfqnl_attach_expect(ct, nfqa[NFQA_EXP],
- NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
- }
+ if (nfnl_ct != NULL)
+ ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo);
}
if (nfqa[NFQA_PAYLOAD]) {
@@ -1006,8 +1081,8 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
payload_len, entry, diff) < 0)
verdict = NF_DROP;
- if (ct)
- nfqnl_ct_seq_adjust(entry->skb, ct, ctinfo, diff);
+ if (ct && diff)
+ nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff);
}
if (nfqa[NFQA_MARK])
@@ -1031,7 +1106,8 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
};
static const struct nf_queue_handler nfqh = {
- .outfn = &nfqnl_enqueue_packet,
+ .outfn = &nfqnl_enqueue_packet,
+ .nf_hook_drop = &nfqnl_nf_hook_drop,
};
static int
@@ -1142,7 +1218,12 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
ret = -EOPNOTSUPP;
goto err_out_unlock;
}
-
+#if !IS_ENABLED(CONFIG_NETWORK_SECMARK)
+ if (flags & mask & NFQA_CFG_F_SECCTX) {
+ ret = -EOPNOTSUPP;
+ goto err_out_unlock;
+ }
+#endif
spin_lock_bh(&queue->lock);
queue->flags &= ~mask;
queue->flags |= flags & mask;
@@ -1257,7 +1338,7 @@ static int seq_show(struct seq_file *s, void *v)
inst->copy_mode, inst->copy_range,
inst->queue_dropped, inst->queue_user_dropped,
inst->id_sequence, 1);
- return seq_has_overflowed(s);
+ return 0;
}
static const struct seq_operations nfqnl_seq_ops = {
@@ -1338,6 +1419,7 @@ static int __init nfnetlink_queue_init(void)
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+ unregister_pernet_subsys(&nfnl_queue_net_ops);
out:
return status;
}
diff --git a/kernel/net/netfilter/nfnetlink_queue_ct.c b/kernel/net/netfilter/nfnetlink_queue_ct.c
deleted file mode 100644
index 96cac50e0..000000000
--- a/kernel/net/netfilter/nfnetlink_queue_ct.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/skbuff.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_queue.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nfnetlink_queue.h>
-
-struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size,
- enum ip_conntrack_info *ctinfo)
-{
- struct nfq_ct_hook *nfq_ct;
- struct nf_conn *ct;
-
- /* rcu_read_lock()ed by __nf_queue already. */
- nfq_ct = rcu_dereference(nfq_ct_hook);
- if (nfq_ct == NULL)
- return NULL;
-
- ct = nf_ct_get(entskb, ctinfo);
- if (ct) {
- if (!nf_ct_is_untracked(ct))
- *size += nfq_ct->build_size(ct);
- else
- ct = NULL;
- }
- return ct;
-}
-
-struct nf_conn *
-nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr,
- enum ip_conntrack_info *ctinfo)
-{
- struct nfq_ct_hook *nfq_ct;
- struct nf_conn *ct;
-
- /* rcu_read_lock()ed by __nf_queue already. */
- nfq_ct = rcu_dereference(nfq_ct_hook);
- if (nfq_ct == NULL)
- return NULL;
-
- ct = nf_ct_get(skb, ctinfo);
- if (ct && !nf_ct_is_untracked(ct))
- nfq_ct->parse(attr, ct);
-
- return ct;
-}
-
-int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
-{
- struct nfq_ct_hook *nfq_ct;
- struct nlattr *nest_parms;
- u_int32_t tmp;
-
- nfq_ct = rcu_dereference(nfq_ct_hook);
- if (nfq_ct == NULL)
- return 0;
-
- nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED);
- if (!nest_parms)
- goto nla_put_failure;
-
- if (nfq_ct->build(skb, ct) < 0)
- goto nla_put_failure;
-
- nla_nest_end(skb, nest_parms);
-
- tmp = ctinfo;
- if (nla_put_be32(skb, NFQA_CT_INFO, htonl(tmp)))
- goto nla_put_failure;
-
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
-void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
- enum ip_conntrack_info ctinfo, int diff)
-{
- struct nfq_ct_hook *nfq_ct;
-
- nfq_ct = rcu_dereference(nfq_ct_hook);
- if (nfq_ct == NULL)
- return;
-
- if ((ct->status & IPS_NAT_MASK) && diff)
- nfq_ct->seq_adjust(skb, ct, ctinfo, diff);
-}
-
-int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
- u32 portid, u32 report)
-{
- struct nfq_ct_hook *nfq_ct;
-
- if (nf_ct_is_untracked(ct))
- return 0;
-
- nfq_ct = rcu_dereference(nfq_ct_hook);
- if (nfq_ct == NULL)
- return -EOPNOTSUPP;
-
- return nfq_ct->attach_expect(attr, ct, portid, report);
-}
diff --git a/kernel/net/netfilter/nft_compat.c b/kernel/net/netfilter/nft_compat.c
index 7f29cfc76..9c8fab001 100644
--- a/kernel/net/netfilter/nft_compat.c
+++ b/kernel/net/netfilter/nft_compat.c
@@ -161,6 +161,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par,
par->hook_mask = 0;
}
par->family = ctx->afi->family;
+ par->nft_compat = true;
}
static void target_compat_from_user(struct xt_target *t, void *in, void *out)
@@ -377,6 +378,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
par->hook_mask = 0;
}
par->family = ctx->afi->family;
+ par->nft_compat = true;
}
static void match_compat_from_user(struct xt_match *m, void *in, void *out)
@@ -617,6 +619,13 @@ struct nft_xt {
static struct nft_expr_type nft_match_type;
+static bool nft_match_cmp(const struct xt_match *match,
+ const char *name, u32 rev, u32 family)
+{
+ return strcmp(match->name, name) == 0 && match->revision == rev &&
+ (match->family == NFPROTO_UNSPEC || match->family == family);
+}
+
static const struct nft_expr_ops *
nft_match_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -624,7 +633,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
struct nft_xt *nft_match;
struct xt_match *match;
char *mt_name;
- __u32 rev, family;
+ u32 rev, family;
if (tb[NFTA_MATCH_NAME] == NULL ||
tb[NFTA_MATCH_REV] == NULL ||
@@ -639,8 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
list_for_each_entry(nft_match, &nft_match_list, head) {
struct xt_match *match = nft_match->ops.data;
- if (strcmp(match->name, mt_name) == 0 &&
- match->revision == rev && match->family == family) {
+ if (nft_match_cmp(match, mt_name, rev, family)) {
if (!try_module_get(match->me))
return ERR_PTR(-ENOENT);
@@ -691,6 +699,13 @@ static LIST_HEAD(nft_target_list);
static struct nft_expr_type nft_target_type;
+static bool nft_target_cmp(const struct xt_target *tg,
+ const char *name, u32 rev, u32 family)
+{
+ return strcmp(tg->name, name) == 0 && tg->revision == rev &&
+ (tg->family == NFPROTO_UNSPEC || tg->family == family);
+}
+
static const struct nft_expr_ops *
nft_target_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -698,7 +713,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
struct nft_xt *nft_target;
struct xt_target *target;
char *tg_name;
- __u32 rev, family;
+ u32 rev, family;
if (tb[NFTA_TARGET_NAME] == NULL ||
tb[NFTA_TARGET_REV] == NULL ||
@@ -713,8 +728,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
list_for_each_entry(nft_target, &nft_target_list, head) {
struct xt_target *target = nft_target->ops.data;
- if (strcmp(target->name, tg_name) == 0 &&
- target->revision == rev && target->family == family) {
+ if (nft_target_cmp(target, tg_name, rev, family)) {
if (!try_module_get(target->me))
return ERR_PTR(-ENOENT);
diff --git a/kernel/net/netfilter/nft_counter.c b/kernel/net/netfilter/nft_counter.c
index 175912392..c7808fc19 100644
--- a/kernel/net/netfilter/nft_counter.c
+++ b/kernel/net/netfilter/nft_counter.c
@@ -18,39 +18,66 @@
#include <net/netfilter/nf_tables.h>
struct nft_counter {
- seqlock_t lock;
u64 bytes;
u64 packets;
};
+struct nft_counter_percpu {
+ struct nft_counter counter;
+ struct u64_stats_sync syncp;
+};
+
+struct nft_counter_percpu_priv {
+ struct nft_counter_percpu __percpu *counter;
+};
+
static void nft_counter_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_counter *priv = nft_expr_priv(expr);
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct nft_counter_percpu *this_cpu;
+
+ local_bh_disable();
+ this_cpu = this_cpu_ptr(priv->counter);
+ u64_stats_update_begin(&this_cpu->syncp);
+ this_cpu->counter.bytes += pkt->skb->len;
+ this_cpu->counter.packets++;
+ u64_stats_update_end(&this_cpu->syncp);
+ local_bh_enable();
+}
- write_seqlock_bh(&priv->lock);
- priv->bytes += pkt->skb->len;
- priv->packets++;
- write_sequnlock_bh(&priv->lock);
+static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter,
+ struct nft_counter *total)
+{
+ const struct nft_counter_percpu *cpu_stats;
+ u64 bytes, packets;
+ unsigned int seq;
+ int cpu;
+
+ memset(total, 0, sizeof(*total));
+ for_each_possible_cpu(cpu) {
+ cpu_stats = per_cpu_ptr(counter, cpu);
+ do {
+ seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ bytes = cpu_stats->counter.bytes;
+ packets = cpu_stats->counter.packets;
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+
+ total->packets += packets;
+ total->bytes += bytes;
+ }
}
static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
- struct nft_counter *priv = nft_expr_priv(expr);
- unsigned int seq;
- u64 bytes;
- u64 packets;
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct nft_counter total;
- do {
- seq = read_seqbegin(&priv->lock);
- bytes = priv->bytes;
- packets = priv->packets;
- } while (read_seqretry(&priv->lock, seq));
+ nft_counter_fetch(priv->counter, &total);
- if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes)))
- goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets)))
+ if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) ||
+ nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets)))
goto nla_put_failure;
return 0;
@@ -67,24 +94,71 @@ static int nft_counter_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_counter *priv = nft_expr_priv(expr);
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct nft_counter_percpu __percpu *cpu_stats;
+ struct nft_counter_percpu *this_cpu;
+
+ cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu);
+ if (cpu_stats == NULL)
+ return -ENOMEM; /* errno is returned negated, as elsewhere in this patch */
+
+ preempt_disable();
+ this_cpu = this_cpu_ptr(cpu_stats);
+ if (tb[NFTA_COUNTER_PACKETS]) {
+ this_cpu->counter.packets =
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ }
+ if (tb[NFTA_COUNTER_BYTES]) {
+ this_cpu->counter.bytes =
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ }
+ preempt_enable();
+ priv->counter = cpu_stats;
+ return 0;
+}
- if (tb[NFTA_COUNTER_PACKETS])
- priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
- if (tb[NFTA_COUNTER_BYTES])
- priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+static void nft_counter_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- seqlock_init(&priv->lock);
+ free_percpu(priv->counter);
+}
+
+static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
+ struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
+ struct nft_counter_percpu __percpu *cpu_stats;
+ struct nft_counter_percpu *this_cpu;
+ struct nft_counter total;
+
+ nft_counter_fetch(priv->counter, &total);
+
+ cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu,
+ GFP_ATOMIC);
+ if (cpu_stats == NULL)
+ return -ENOMEM; /* errno is returned negated, as elsewhere in this patch */
+
+ preempt_disable();
+ this_cpu = this_cpu_ptr(cpu_stats);
+ this_cpu->counter.packets = total.packets;
+ this_cpu->counter.bytes = total.bytes;
+ preempt_enable();
+
+ priv_clone->counter = cpu_stats;
return 0;
}
static struct nft_expr_type nft_counter_type;
static const struct nft_expr_ops nft_counter_ops = {
.type = &nft_counter_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_counter)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)),
.eval = nft_counter_eval,
.init = nft_counter_init,
+ .destroy = nft_counter_destroy,
.dump = nft_counter_dump,
+ .clone = nft_counter_clone,
};
static struct nft_expr_type nft_counter_type __read_mostly = {
diff --git a/kernel/net/netfilter/nft_ct.c b/kernel/net/netfilter/nft_ct.c
index 8cbca3432..939921532 100644
--- a/kernel/net/netfilter/nft_ct.c
+++ b/kernel/net/netfilter/nft_ct.c
@@ -366,6 +366,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
switch (priv->key) {
+ case NFT_CT_L3PROTOCOL:
case NFT_CT_PROTOCOL:
case NFT_CT_SRC:
case NFT_CT_DST:
diff --git a/kernel/net/netfilter/nft_dynset.c b/kernel/net/netfilter/nft_dynset.c
index 513a8ef60..9dec3bd1b 100644
--- a/kernel/net/netfilter/nft_dynset.c
+++ b/kernel/net/netfilter/nft_dynset.c
@@ -50,8 +50,9 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
}
ext = nft_set_elem_ext(set, elem);
- if (priv->expr != NULL)
- nft_expr_clone(nft_set_ext_expr(ext), priv->expr);
+ if (priv->expr != NULL &&
+ nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0)
+ return NULL;
return elem;
}
diff --git a/kernel/net/netfilter/nft_limit.c b/kernel/net/netfilter/nft_limit.c
index 435c1ccd6..5d67938f8 100644
--- a/kernel/net/netfilter/nft_limit.c
+++ b/kernel/net/netfilter/nft_limit.c
@@ -20,63 +20,79 @@
static DEFINE_SPINLOCK(limit_lock);
struct nft_limit {
+ u64 last;
u64 tokens;
+ u64 tokens_max;
u64 rate;
- u64 unit;
- unsigned long stamp;
+ u64 nsecs;
+ u32 burst;
};
-static void nft_limit_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost)
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ u64 now, tokens;
+ s64 delta;
spin_lock_bh(&limit_lock);
- if (time_after_eq(jiffies, priv->stamp)) {
- priv->tokens = priv->rate;
- priv->stamp = jiffies + priv->unit * HZ;
- }
-
- if (priv->tokens >= 1) {
- priv->tokens--;
+ now = ktime_get_ns();
+ tokens = limit->tokens + now - limit->last;
+ if (tokens > limit->tokens_max)
+ tokens = limit->tokens_max;
+
+ limit->last = now;
+ delta = tokens - cost;
+ if (delta >= 0) {
+ limit->tokens = delta;
spin_unlock_bh(&limit_lock);
- return;
+ return false;
}
+ limit->tokens = tokens;
spin_unlock_bh(&limit_lock);
-
- regs->verdict.code = NFT_BREAK;
+ return true;
}
-static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = {
- [NFTA_LIMIT_RATE] = { .type = NLA_U64 },
- [NFTA_LIMIT_UNIT] = { .type = NLA_U64 },
-};
-
-static int nft_limit_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
+static int nft_limit_init(struct nft_limit *limit,
const struct nlattr * const tb[])
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ u64 unit;
if (tb[NFTA_LIMIT_RATE] == NULL ||
tb[NFTA_LIMIT_UNIT] == NULL)
return -EINVAL;
- priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
- priv->unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
- priv->stamp = jiffies + priv->unit * HZ;
- priv->tokens = priv->rate;
+ limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
+ limit->nsecs = unit * NSEC_PER_SEC;
+ if (limit->rate == 0 || limit->nsecs < unit)
+ return -EOVERFLOW;
+ limit->tokens = limit->tokens_max = limit->nsecs;
+
+ if (tb[NFTA_LIMIT_BURST]) {
+ u64 rate;
+
+ limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST]));
+
+ rate = limit->rate + limit->burst;
+ if (rate < limit->rate)
+ return -EOVERFLOW;
+
+ limit->rate = rate;
+ }
+ limit->last = ktime_get_ns();
+
return 0;
}
-static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit,
+ enum nft_limit_type type)
{
- const struct nft_limit *priv = nft_expr_priv(expr);
+ u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC);
+ u64 rate = limit->rate - limit->burst;
- if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate)))
- goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit)))
+ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) ||
+ nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) ||
+ nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) ||
+ nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)))
goto nla_put_failure;
return 0;
@@ -84,18 +100,114 @@ nla_put_failure:
return -1;
}
+struct nft_limit_pkts {
+ struct nft_limit limit;
+ u64 cost;
+};
+
+static void nft_limit_pkts_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_limit_pkts *priv = nft_expr_priv(expr);
+
+ if (nft_limit_eval(&priv->limit, priv->cost))
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = {
+ [NFTA_LIMIT_RATE] = { .type = NLA_U64 },
+ [NFTA_LIMIT_UNIT] = { .type = NLA_U64 },
+ [NFTA_LIMIT_BURST] = { .type = NLA_U32 },
+ [NFTA_LIMIT_TYPE] = { .type = NLA_U32 },
+};
+
+static int nft_limit_pkts_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ int err;
+
+ err = nft_limit_init(&priv->limit, tb);
+ if (err < 0)
+ return err;
+
+ priv->cost = div_u64(priv->limit.nsecs, priv->limit.rate);
+ return 0;
+}
+
+static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_limit_pkts *priv = nft_expr_priv(expr);
+
+ return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
+}
+
static struct nft_expr_type nft_limit_type;
-static const struct nft_expr_ops nft_limit_ops = {
+static const struct nft_expr_ops nft_limit_pkts_ops = {
+ .type = &nft_limit_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+ .eval = nft_limit_pkts_eval,
+ .init = nft_limit_pkts_init,
+ .dump = nft_limit_pkts_dump,
+};
+
+static void nft_limit_pkt_bytes_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_limit *priv = nft_expr_priv(expr);
+ u64 cost = div_u64(priv->nsecs * pkt->skb->len, priv->rate);
+
+ if (nft_limit_eval(priv, cost))
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_limit_pkt_bytes_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_limit *priv = nft_expr_priv(expr);
+
+ return nft_limit_init(priv, tb);
+}
+
+static int nft_limit_pkt_bytes_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_limit *priv = nft_expr_priv(expr);
+
+ return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
+}
+
+static const struct nft_expr_ops nft_limit_pkt_bytes_ops = {
.type = &nft_limit_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_limit)),
- .eval = nft_limit_eval,
- .init = nft_limit_init,
- .dump = nft_limit_dump,
+ .eval = nft_limit_pkt_bytes_eval,
+ .init = nft_limit_pkt_bytes_init,
+ .dump = nft_limit_pkt_bytes_dump,
};
+static const struct nft_expr_ops *
+nft_limit_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_LIMIT_TYPE] == NULL)
+ return &nft_limit_pkts_ops;
+
+ switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) {
+ case NFT_LIMIT_PKTS:
+ return &nft_limit_pkts_ops;
+ case NFT_LIMIT_PKT_BYTES:
+ return &nft_limit_pkt_bytes_ops;
+ }
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
static struct nft_expr_type nft_limit_type __read_mostly = {
.name = "limit",
- .ops = &nft_limit_ops,
+ .select_ops = nft_limit_select_ops,
.policy = nft_limit_policy,
.maxattr = NFTA_LIMIT_MAX,
.flags = NFT_EXPR_STATEFUL,
diff --git a/kernel/net/netfilter/nft_log.c b/kernel/net/netfilter/nft_log.c
index a13d6a386..319c22b4b 100644
--- a/kernel/net/netfilter/nft_log.c
+++ b/kernel/net/netfilter/nft_log.c
@@ -31,9 +31,8 @@ static void nft_log_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
const struct nft_log *priv = nft_expr_priv(expr);
- struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
- nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in,
+ nf_log_packet(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in,
pkt->out, &priv->loginfo, "%s", priv->prefix);
}
diff --git a/kernel/net/netfilter/nft_meta.c b/kernel/net/netfilter/nft_meta.c
index 52561e1c3..9dfaf4d55 100644
--- a/kernel/net/netfilter/nft_meta.c
+++ b/kernel/net/netfilter/nft_meta.c
@@ -31,6 +31,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
const struct net_device *in = pkt->in, *out = pkt->out;
+ struct sock *sk;
u32 *dest = &regs->data[priv->dreg];
switch (priv->key) {
@@ -42,7 +43,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*(__be16 *)dest = skb->protocol;
break;
case NFT_META_NFPROTO:
- *dest = pkt->ops->pf;
+ *dest = pkt->pf;
break;
case NFT_META_L4PROTO:
*dest = pkt->tprot;
@@ -86,33 +87,35 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*(u16 *)dest = out->type;
break;
case NFT_META_SKUID:
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
goto err;
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket == NULL ||
- skb->sk->sk_socket->file == NULL) {
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket == NULL ||
+ sk->sk_socket->file == NULL) {
+ read_unlock_bh(&sk->sk_callback_lock);
goto err;
}
*dest = from_kuid_munged(&init_user_ns,
- skb->sk->sk_socket->file->f_cred->fsuid);
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ sk->sk_socket->file->f_cred->fsuid);
+ read_unlock_bh(&sk->sk_callback_lock);
break;
case NFT_META_SKGID:
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
goto err;
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket == NULL ||
- skb->sk->sk_socket->file == NULL) {
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket == NULL ||
+ sk->sk_socket->file == NULL) {
+ read_unlock_bh(&sk->sk_callback_lock);
goto err;
}
*dest = from_kgid_munged(&init_user_ns,
- skb->sk->sk_socket->file->f_cred->fsgid);
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ sk->sk_socket->file->f_cred->fsgid);
+ read_unlock_bh(&sk->sk_callback_lock);
break;
#ifdef CONFIG_IP_ROUTE_CLASSID
case NFT_META_RTCLASSID: {
@@ -135,7 +138,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
break;
}
- switch (pkt->ops->pf) {
+ switch (pkt->pf) {
case NFPROTO_IPV4:
if (ipv4_is_multicast(ip_hdr(skb)->daddr))
*dest = PACKET_MULTICAST;
@@ -166,11 +169,14 @@ void nft_meta_get_eval(const struct nft_expr *expr,
goto err;
*dest = out->group;
break;
+#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
goto err;
- *dest = skb->sk->sk_classid;
+ *dest = sk->sk_classid;
break;
+#endif
default:
WARN_ON(1);
goto err;
@@ -246,7 +252,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
case NFT_META_CPU:
case NFT_META_IIFGROUP:
case NFT_META_OIFGROUP:
+#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
+#endif
len = sizeof(u32);
break;
case NFT_META_IIFNAME:
diff --git a/kernel/net/netfilter/nft_payload.c b/kernel/net/netfilter/nft_payload.c
index 94fb3b27a..09b4b07eb 100644
--- a/kernel/net/netfilter/nft_payload.c
+++ b/kernel/net/netfilter/nft_payload.c
@@ -9,6 +9,7 @@
*/
#include <linux/kernel.h>
+#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netlink.h>
@@ -17,6 +18,53 @@
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+/* add vlan header into the user buffer if the tag was removed by offloads */
+static bool
+nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
+{
+ int mac_off = skb_mac_header(skb) - skb->data;
+ u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d;
+ struct vlan_ethhdr veth;
+
+ vlanh = (u8 *) &veth;
+ if (offset < ETH_HLEN) {
+ u8 ethlen = min_t(u8, len, ETH_HLEN - offset);
+
+ if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN))
+ return false;
+
+ veth.h_vlan_proto = skb->vlan_proto;
+
+ memcpy(dst_u8, vlanh + offset, ethlen);
+
+ len -= ethlen;
+ if (len == 0)
+ return true;
+
+ dst_u8 += ethlen;
+ offset = ETH_HLEN;
+ } else if (offset >= VLAN_ETH_HLEN) {
+ offset -= VLAN_HLEN;
+ goto skip;
+ }
+
+ veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+ veth.h_vlan_encapsulated_proto = skb->protocol;
+
+ vlanh += offset;
+
+ vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset);
+ memcpy(dst_u8, vlanh, vlan_len);
+
+ len -= vlan_len;
+ if (!len)
+ return true;
+
+ dst_u8 += vlan_len;
+ skip:
+ return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0;
+}
+
static void nft_payload_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -26,10 +74,18 @@ static void nft_payload_eval(const struct nft_expr *expr,
u32 *dest = &regs->data[priv->dreg];
int offset;
+ dest[priv->len / NFT_REG32_SIZE] = 0;
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
if (!skb_mac_header_was_set(skb))
goto err;
+
+ if (skb_vlan_tag_present(skb)) {
+ if (!nft_payload_copy_vlan(dest, skb,
+ priv->offset, priv->len))
+ goto err;
+ return;
+ }
offset = skb_mac_header(skb) - skb->data;
break;
case NFT_PAYLOAD_NETWORK_HEADER:
@@ -43,7 +99,6 @@ static void nft_payload_eval(const struct nft_expr *expr,
}
offset += priv->offset;
- dest[priv->len / NFT_REG32_SIZE] = 0;
if (skb_copy_bits(skb, offset, dest, priv->len) < 0)
goto err;
return;
diff --git a/kernel/net/netfilter/nft_queue.c b/kernel/net/netfilter/nft_queue.c
index 96805d21d..61d216eb7 100644
--- a/kernel/net/netfilter/nft_queue.c
+++ b/kernel/net/netfilter/nft_queue.c
@@ -42,7 +42,7 @@ static void nft_queue_eval(const struct nft_expr *expr,
queue = priv->queuenum + cpu % priv->queues_total;
} else {
queue = nfqueue_hash(pkt->skb, queue,
- priv->queues_total, pkt->ops->pf,
+ priv->queues_total, pkt->pf,
jhash_initval);
}
}
diff --git a/kernel/net/netfilter/nft_reject_inet.c b/kernel/net/netfilter/nft_reject_inet.c
index 635dbba93..759ca5248 100644
--- a/kernel/net/netfilter/nft_reject_inet.c
+++ b/kernel/net/netfilter/nft_reject_inet.c
@@ -22,38 +22,37 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
struct nft_reject *priv = nft_expr_priv(expr);
- struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out);
- switch (pkt->ops->pf) {
+ switch (pkt->pf) {
case NFPROTO_IPV4:
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
nf_send_unreach(pkt->skb, priv->icmp_code,
- pkt->ops->hooknum);
+ pkt->hook);
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->skb, pkt->ops->hooknum);
+ nf_send_reset(pkt->net, pkt->skb, pkt->hook);
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach(pkt->skb,
nft_reject_icmp_code(priv->icmp_code),
- pkt->ops->hooknum);
+ pkt->hook);
break;
}
break;
case NFPROTO_IPV6:
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach6(net, pkt->skb, priv->icmp_code,
- pkt->ops->hooknum);
+ nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code,
+ pkt->hook);
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(net, pkt->skb, pkt->ops->hooknum);
+ nf_send_reset6(pkt->net, pkt->skb, pkt->hook);
break;
case NFT_REJECT_ICMPX_UNREACH:
- nf_send_unreach6(net, pkt->skb,
+ nf_send_unreach6(pkt->net, pkt->skb,
nft_reject_icmpv6_code(priv->icmp_code),
- pkt->ops->hooknum);
+ pkt->hook);
break;
}
break;
diff --git a/kernel/net/netfilter/x_tables.c b/kernel/net/netfilter/x_tables.c
index 51a459c3c..d4aaad747 100644
--- a/kernel/net/netfilter/x_tables.c
+++ b/kernel/net/netfilter/x_tables.c
@@ -67,9 +67,6 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
[NFPROTO_IPV6] = "ip6",
};
-/* Allow this many total (re)entries. */
-static const unsigned int xt_jumpstack_multiplier = 2;
-
/* Registration hooks for targets. */
int xt_register_target(struct xt_target *target)
{
@@ -658,35 +655,23 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
struct xt_table_info *xt_alloc_table_info(unsigned int size)
{
- struct xt_table_info *newinfo;
- int cpu;
+ struct xt_table_info *info = NULL;
+ size_t sz = sizeof(*info) + size;
/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
return NULL;
- newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
- if (!newinfo)
- return NULL;
-
- newinfo->size = size;
-
- for_each_possible_cpu(cpu) {
- if (size <= PAGE_SIZE)
- newinfo->entries[cpu] = kmalloc_node(size,
- GFP_KERNEL,
- cpu_to_node(cpu));
- else
- newinfo->entries[cpu] = vmalloc_node(size,
- cpu_to_node(cpu));
-
- if (newinfo->entries[cpu] == NULL) {
- xt_free_table_info(newinfo);
+ if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+ info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+ if (!info) {
+ info = vmalloc(sz);
+ if (!info)
return NULL;
- }
}
-
- return newinfo;
+ memset(info, 0, sizeof(*info));
+ info->size = size;
+ return info;
}
EXPORT_SYMBOL(xt_alloc_table_info);
@@ -694,18 +679,13 @@ void xt_free_table_info(struct xt_table_info *info)
{
int cpu;
- for_each_possible_cpu(cpu)
- kvfree(info->entries[cpu]);
-
if (info->jumpstack != NULL) {
for_each_possible_cpu(cpu)
kvfree(info->jumpstack[cpu]);
kvfree(info->jumpstack);
}
- free_percpu(info->stackptr);
-
- kfree(info);
+ kvfree(info);
}
EXPORT_SYMBOL(xt_free_table_info);
@@ -747,15 +727,14 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock);
DEFINE_PER_CPU(seqcount_t, xt_recseq);
EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
+struct static_key xt_tee_enabled __read_mostly;
+EXPORT_SYMBOL_GPL(xt_tee_enabled);
+
static int xt_jumpstack_alloc(struct xt_table_info *i)
{
unsigned int size;
int cpu;
- i->stackptr = alloc_percpu(unsigned int);
- if (i->stackptr == NULL)
- return -ENOMEM;
-
size = sizeof(void **) * nr_cpu_ids;
if (size > PAGE_SIZE)
i->jumpstack = vzalloc(size);
@@ -764,8 +743,21 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
if (i->jumpstack == NULL)
return -ENOMEM;
- i->stacksize *= xt_jumpstack_multiplier;
- size = sizeof(void *) * i->stacksize;
+ /* ruleset without jumps -- no stack needed */
+ if (i->stacksize == 0)
+ return 0;
+
+ /* Jumpstack needs to be able to record two full callchains, one
+ * from the first rule set traversal, plus one table reentrancy
+ * via -j TEE without clobbering the callchain that brought us to
+ * TEE target.
+ *
+ * This is done by allocating two jumpstacks per cpu; on reentry,
+ * the upper half of the stack is used.
+ *
+ * see the jumpstack setup in ipt_do_table() for more details.
+ */
+ size = sizeof(void *) * i->stacksize * 2u;
for_each_possible_cpu(cpu) {
if (size > PAGE_SIZE)
i->jumpstack[cpu] = vmalloc_node(size,
@@ -947,11 +939,9 @@ static int xt_table_seq_show(struct seq_file *seq, void *v)
{
struct xt_table *table = list_entry(v, struct xt_table, list);
- if (strlen(table->name)) {
+ if (*table->name)
seq_printf(seq, "%s\n", table->name);
- return seq_has_overflowed(seq);
- } else
- return 0;
+ return 0;
}
static const struct seq_operations xt_table_seq_ops = {
@@ -1087,10 +1077,8 @@ static int xt_match_seq_show(struct seq_file *seq, void *v)
if (trav->curr == trav->head)
return 0;
match = list_entry(trav->curr, struct xt_match, list);
- if (*match->name == '\0')
- return 0;
- seq_printf(seq, "%s\n", match->name);
- return seq_has_overflowed(seq);
+ if (*match->name)
+ seq_printf(seq, "%s\n", match->name);
}
return 0;
}
@@ -1142,10 +1130,8 @@ static int xt_target_seq_show(struct seq_file *seq, void *v)
if (trav->curr == trav->head)
return 0;
target = list_entry(trav->curr, struct xt_target, list);
- if (*target->name == '\0')
- return 0;
- seq_printf(seq, "%s\n", target->name);
- return seq_has_overflowed(seq);
+ if (*target->name)
+ seq_printf(seq, "%s\n", target->name);
}
return 0;
}
@@ -1207,7 +1193,6 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
if (!(hook_mask & 1))
continue;
ops[i].hook = fn;
- ops[i].owner = table->me;
ops[i].pf = table->af;
ops[i].hooknum = hooknum;
ops[i].priority = table->priority;
diff --git a/kernel/net/netfilter/xt_CT.c b/kernel/net/netfilter/xt_CT.c
index 75747aecd..e7ac07e53 100644
--- a/kernel/net/netfilter/xt_CT.c
+++ b/kernel/net/netfilter/xt_CT.c
@@ -171,6 +171,9 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
if (timeout_ext == NULL)
ret = -ENOMEM;
+ rcu_read_unlock();
+ return ret;
+
err_put_timeout:
__xt_ct_tg_timeout_put(timeout);
out:
@@ -181,10 +184,23 @@ out:
#endif
}
+static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info)
+{
+ switch (info->flags & (XT_CT_ZONE_DIR_ORIG |
+ XT_CT_ZONE_DIR_REPL)) {
+ case XT_CT_ZONE_DIR_ORIG:
+ return NF_CT_ZONE_DIR_ORIG;
+ case XT_CT_ZONE_DIR_REPL:
+ return NF_CT_ZONE_DIR_REPL;
+ default:
+ return NF_CT_DEFAULT_ZONE_DIR;
+ }
+}
+
static int xt_ct_tg_check(const struct xt_tgchk_param *par,
struct xt_ct_target_info_v1 *info)
{
- struct nf_conntrack_tuple t;
+ struct nf_conntrack_zone zone;
struct nf_conn *ct;
int ret = -EOPNOTSUPP;
@@ -194,7 +210,9 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
}
#ifndef CONFIG_NF_CONNTRACK_ZONES
- if (info->zone)
+ if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG |
+ XT_CT_ZONE_DIR_REPL |
+ XT_CT_ZONE_MARK))
goto err1;
#endif
@@ -202,11 +220,17 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
if (ret < 0)
goto err1;
- memset(&t, 0, sizeof(t));
- ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL);
- ret = PTR_ERR(ct);
- if (IS_ERR(ct))
+ memset(&zone, 0, sizeof(zone));
+ zone.id = info->zone;
+ zone.dir = xt_ct_flags_to_dir(info);
+ if (info->flags & XT_CT_ZONE_MARK)
+ zone.flags |= NF_CT_FLAG_MARK;
+
+ ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL);
+ if (!ct) {
+ ret = -ENOMEM;
goto err2;
+ }
ret = 0;
if ((info->ct_events || info->exp_events) &&
@@ -227,14 +251,14 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
if (ret < 0)
goto err3;
}
-
- nf_conntrack_tmpl_insert(par->net, ct);
+ __set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ nf_conntrack_get(&ct->ct_general);
out:
info->ct = ct;
return 0;
err3:
- nf_conntrack_free(ct);
+ nf_ct_tmpl_free(ct);
err2:
nf_ct_l3proto_module_put(par->family);
err1:
@@ -297,8 +321,10 @@ static void xt_ct_destroy_timeout(struct nf_conn *ct)
if (timeout_put) {
timeout_ext = nf_ct_timeout_find(ct);
- if (timeout_ext)
+ if (timeout_ext) {
timeout_put(timeout_ext->timeout);
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ }
}
rcu_read_unlock();
#endif
diff --git a/kernel/net/netfilter/xt_IDLETIMER.c b/kernel/net/netfilter/xt_IDLETIMER.c
index f407ebc13..29d2c31f4 100644
--- a/kernel/net/netfilter/xt_IDLETIMER.c
+++ b/kernel/net/netfilter/xt_IDLETIMER.c
@@ -126,6 +126,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
goto out;
}
+ sysfs_attr_init(&info->timer->attr.attr);
info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
if (!info->timer->attr.attr.name) {
ret = -ENOMEM;
diff --git a/kernel/net/netfilter/xt_LOG.c b/kernel/net/netfilter/xt_LOG.c
index c13b79440..1763ab82b 100644
--- a/kernel/net/netfilter/xt_LOG.c
+++ b/kernel/net/netfilter/xt_LOG.c
@@ -33,7 +33,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_log_info *loginfo = par->targinfo;
struct nf_loginfo li;
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
li.type = NF_LOG_TYPE_LOG;
li.u.log.level = loginfo->level;
diff --git a/kernel/net/netfilter/xt_NFLOG.c b/kernel/net/netfilter/xt_NFLOG.c
index fb7497c92..a1fa2c800 100644
--- a/kernel/net/netfilter/xt_NFLOG.c
+++ b/kernel/net/netfilter/xt_NFLOG.c
@@ -26,7 +26,7 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_nflog_info *info = par->targinfo;
struct nf_loginfo li;
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
li.type = NF_LOG_TYPE_ULOG;
li.u.ulog.copy_len = info->len;
diff --git a/kernel/net/netfilter/xt_TCPMSS.c b/kernel/net/netfilter/xt_TCPMSS.c
index e762de5ee..b7c43def0 100644
--- a/kernel/net/netfilter/xt_TCPMSS.c
+++ b/kernel/net/netfilter/xt_TCPMSS.c
@@ -108,7 +108,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
return -1;
if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
if (dst_mtu(skb_dst(skb)) <= minlen) {
@@ -144,7 +144,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
inet_proto_csum_replace2(&tcph->check, skb,
htons(oldmss), htons(newmss),
- 0);
+ false);
return 0;
}
}
@@ -185,18 +185,18 @@ tcpmss_mangle_packet(struct sk_buff *skb,
memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr));
inet_proto_csum_replace2(&tcph->check, skb,
- htons(len), htons(len + TCPOLEN_MSS), 1);
+ htons(len), htons(len + TCPOLEN_MSS), true);
opt[0] = TCPOPT_MSS;
opt[1] = TCPOLEN_MSS;
opt[2] = (newmss & 0xff00) >> 8;
opt[3] = newmss & 0x00ff;
- inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0);
+ inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false);
oldval = ((__be16 *)tcph)[6];
tcph->doff += TCPOLEN_MSS/4;
inet_proto_csum_replace2(&tcph->check, skb,
- oldval, ((__be16 *)tcph)[6], 0);
+ oldval, ((__be16 *)tcph)[6], false);
return TCPOLEN_MSS;
}
@@ -277,6 +277,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
"FORWARD, OUTPUT and POSTROUTING hooks\n");
return -EINVAL;
}
+ if (par->nft_compat)
+ return 0;
+
xt_ematch_foreach(ematch, e)
if (find_syn_match(ematch))
return 0;
@@ -299,6 +302,9 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
"FORWARD, OUTPUT and POSTROUTING hooks\n");
return -EINVAL;
}
+ if (par->nft_compat)
+ return 0;
+
xt_ematch_foreach(ematch, e)
if (find_syn_match(ematch))
return 0;
diff --git a/kernel/net/netfilter/xt_TCPOPTSTRIP.c b/kernel/net/netfilter/xt_TCPOPTSTRIP.c
index 625fa1d63..eb92bffff 100644
--- a/kernel/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/kernel/net/netfilter/xt_TCPOPTSTRIP.c
@@ -80,7 +80,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,
n <<= 8;
}
inet_proto_csum_replace2(&tcph->check, skb, htons(o),
- htons(n), 0);
+ htons(n), false);
}
memset(opt + i, TCPOPT_NOP, optl);
}
diff --git a/kernel/net/netfilter/xt_TEE.c b/kernel/net/netfilter/xt_TEE.c
index 292934d23..3eff7b67c 100644
--- a/kernel/net/netfilter/xt_TEE.c
+++ b/kernel/net/netfilter/xt_TEE.c
@@ -10,26 +10,15 @@
* modify it under the terms of the GNU General Public License
* version 2 or later, as published by the Free Software Foundation.
*/
-#include <linux/ip.h>
#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/route.h>
#include <linux/skbuff.h>
-#include <linux/notifier.h>
-#include <net/checksum.h>
-#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <net/route.h>
+#include <linux/route.h>
#include <linux/netfilter/x_tables.h>
+#include <net/route.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+#include <net/netfilter/ipv6/nf_dup_ipv6.h>
#include <linux/netfilter/xt_TEE.h>
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-# define WITH_CONNTRACK 1
-# include <net/netfilter/nf_conntrack.h>
-#endif
-
struct xt_tee_priv {
struct notifier_block notifier;
struct xt_tee_tginfo *tginfo;
@@ -37,162 +26,27 @@ struct xt_tee_priv {
};
static const union nf_inet_addr tee_zero_address;
-static DEFINE_PER_CPU(bool, tee_active);
-
-static struct net *pick_net(struct sk_buff *skb)
-{
-#ifdef CONFIG_NET_NS
- const struct dst_entry *dst;
-
- if (skb->dev != NULL)
- return dev_net(skb->dev);
- dst = skb_dst(skb);
- if (dst != NULL && dst->dev != NULL)
- return dev_net(dst->dev);
-#endif
- return &init_net;
-}
-
-static bool
-tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
-{
- const struct iphdr *iph = ip_hdr(skb);
- struct net *net = pick_net(skb);
- struct rtable *rt;
- struct flowi4 fl4;
-
- memset(&fl4, 0, sizeof(fl4));
- if (info->priv) {
- if (info->priv->oif == -1)
- return false;
- fl4.flowi4_oif = info->priv->oif;
- }
- fl4.daddr = info->gw.ip;
- fl4.flowi4_tos = RT_TOS(iph->tos);
- fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
- fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
- rt = ip_route_output_key(net, &fl4);
- if (IS_ERR(rt))
- return false;
-
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- skb->dev = rt->dst.dev;
- skb->protocol = htons(ETH_P_IP);
- return true;
-}
static unsigned int
tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tee_tginfo *info = par->targinfo;
- struct iphdr *iph;
+ int oif = info->priv ? info->priv->oif : 0;
- if (__this_cpu_read(tee_active))
- return XT_CONTINUE;
- /*
- * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
- * the original skb, which should continue on its way as if nothing has
- * happened. The copy should be independently delivered to the TEE
- * --gateway.
- */
- skb = pskb_copy(skb, GFP_ATOMIC);
- if (skb == NULL)
- return XT_CONTINUE;
-
-#ifdef WITH_CONNTRACK
- /* Avoid counting cloned packets towards the original connection. */
- nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
-#endif
- /*
- * If we are in PREROUTING/INPUT, the checksum must be recalculated
- * since the length could have changed as a result of defragmentation.
- *
- * We also decrease the TTL to mitigate potential TEE loops
- * between two hosts.
- *
- * Set %IP_DF so that the original source is notified of a potentially
- * decreased MTU on the clone route. IPv6 does this too.
- */
- iph = ip_hdr(skb);
- iph->frag_off |= htons(IP_DF);
- if (par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_IN)
- --iph->ttl;
- ip_send_check(iph);
+ nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, oif);
- if (tee_tg_route4(skb, info)) {
- __this_cpu_write(tee_active, true);
- ip_local_out(skb);
- __this_cpu_write(tee_active, false);
- } else {
- kfree_skb(skb);
- }
return XT_CONTINUE;
}
-#if IS_ENABLED(CONFIG_IPV6)
-static bool
-tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)
-{
- const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct net *net = pick_net(skb);
- struct dst_entry *dst;
- struct flowi6 fl6;
-
- memset(&fl6, 0, sizeof(fl6));
- if (info->priv) {
- if (info->priv->oif == -1)
- return false;
- fl6.flowi6_oif = info->priv->oif;
- }
- fl6.daddr = info->gw.in6;
- fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
- (iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
- dst = ip6_route_output(net, NULL, &fl6);
- if (dst->error) {
- dst_release(dst);
- return false;
- }
- skb_dst_drop(skb);
- skb_dst_set(skb, dst);
- skb->dev = dst->dev;
- skb->protocol = htons(ETH_P_IPV6);
- return true;
-}
-
+#if IS_ENABLED(CONFIG_NF_DUP_IPV6)
static unsigned int
tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tee_tginfo *info = par->targinfo;
+ int oif = info->priv ? info->priv->oif : 0;
- if (__this_cpu_read(tee_active))
- return XT_CONTINUE;
- skb = pskb_copy(skb, GFP_ATOMIC);
- if (skb == NULL)
- return XT_CONTINUE;
+ nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, oif);
-#ifdef WITH_CONNTRACK
- nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
-#endif
- if (par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_IN) {
- struct ipv6hdr *iph = ipv6_hdr(skb);
- --iph->hop_limit;
- }
- if (tee_tg_route6(skb, info)) {
- __this_cpu_write(tee_active, true);
- ip6_local_out(skb);
- __this_cpu_write(tee_active, false);
- } else {
- kfree_skb(skb);
- }
return XT_CONTINUE;
}
#endif
@@ -251,6 +105,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
} else
info->priv = NULL;
+ static_key_slow_inc(&xt_tee_enabled);
return 0;
}
@@ -262,6 +117,7 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par)
unregister_netdevice_notifier(&info->priv->notifier);
kfree(info->priv);
}
+ static_key_slow_dec(&xt_tee_enabled);
}
static struct xt_target tee_tg_reg[] __read_mostly = {
@@ -275,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
.destroy = tee_tg_destroy,
.me = THIS_MODULE,
},
-#if IS_ENABLED(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_NF_DUP_IPV6)
{
.name = "TEE",
.revision = 1,
diff --git a/kernel/net/netfilter/xt_TPROXY.c b/kernel/net/netfilter/xt_TPROXY.c
index cca96cec1..3ab591e73 100644
--- a/kernel/net/netfilter/xt_TPROXY.c
+++ b/kernel/net/netfilter/xt_TPROXY.c
@@ -250,8 +250,8 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
* no such listener is found, or NULL if the TCP header is incomplete.
*/
static struct sock *
-tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
- struct sock *sk)
+tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
+ __be32 laddr, __be16 lport, struct sock *sk)
{
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr _hdr, *hp;
@@ -267,13 +267,12 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ sk2 = nf_tproxy_get_sock_v4(net, iph->protocol,
iph->saddr, laddr ? laddr : iph->daddr,
hp->source, lport ? lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
@@ -291,7 +290,7 @@ nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
}
static unsigned int
-tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
u_int32_t mark_mask, u_int32_t mark_value)
{
const struct iphdr *iph = ip_hdr(skb);
@@ -306,7 +305,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
- sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ sk = nf_tproxy_get_sock_v4(net, iph->protocol,
iph->saddr, iph->daddr,
hp->source, hp->dest,
skb->dev, NFT_LOOKUP_ESTABLISHED);
@@ -318,11 +317,11 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
/* UDP has no TCP_TIME_WAIT state, so we never enter here */
if (sk && sk->sk_state == TCP_TIME_WAIT)
/* reopening a TIME_WAIT connection needs special handling */
- sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
+ sk = tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
else if (!sk)
/* no, there's no established connection, check if
* there's a listener on the redirected addr/port */
- sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ sk = nf_tproxy_get_sock_v4(net, iph->protocol,
iph->saddr, laddr,
hp->source, lport,
skb->dev, NFT_LOOKUP_LISTENER);
@@ -352,7 +351,7 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info *tgi = par->targinfo;
- return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+ return tproxy_tg4(par->net, skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
}
static unsigned int
@@ -360,7 +359,7 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
- return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+ return tproxy_tg4(par->net, skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
}
#ifdef XT_TPROXY_HAVE_IPV6
@@ -430,15 +429,14 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ sk2 = nf_tproxy_get_sock_v6(par->net, tproto,
&iph->saddr,
tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
hp->source,
tgi->lport ? tgi->lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk));
- inet_twsk_put(inet_twsk(sk));
+ inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
@@ -474,7 +472,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
- sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ sk = nf_tproxy_get_sock_v6(par->net, tproto,
&iph->saddr, &iph->daddr,
hp->source, hp->dest,
par->in, NFT_LOOKUP_ESTABLISHED);
@@ -489,7 +487,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
else if (!sk)
/* no there's no established connection, check if
* there's a listener on the redirected addr/port */
- sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ sk = nf_tproxy_get_sock_v6(par->net, tproto,
&iph->saddr, laddr,
hp->source, lport,
par->in, NFT_LOOKUP_LISTENER);
diff --git a/kernel/net/netfilter/xt_addrtype.c b/kernel/net/netfilter/xt_addrtype.c
index fab6eea1b..11d609199 100644
--- a/kernel/net/netfilter/xt_addrtype.c
+++ b/kernel/net/netfilter/xt_addrtype.c
@@ -73,7 +73,7 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
if (dev == NULL && rt->rt6i_flags & RTF_LOCAL)
ret |= XT_ADDRTYPE_LOCAL;
- if (rt->rt6i_flags & RTF_ANYCAST)
+ if (ipv6_anycast_destination((struct dst_entry *)rt, addr))
ret |= XT_ADDRTYPE_ANYCAST;
dst_release(&rt->dst);
@@ -125,7 +125,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev,
static bool
addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
const struct xt_addrtype_info *info = par->matchinfo;
const struct iphdr *iph = ip_hdr(skb);
bool ret = true;
@@ -143,7 +143,7 @@ addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
static bool
addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
const struct xt_addrtype_info_v1 *info = par->matchinfo;
const struct iphdr *iph;
const struct net_device *dev = NULL;
diff --git a/kernel/net/netfilter/xt_connlabel.c b/kernel/net/netfilter/xt_connlabel.c
index 9f8719df2..bb9cbeb18 100644
--- a/kernel/net/netfilter/xt_connlabel.c
+++ b/kernel/net/netfilter/xt_connlabel.c
@@ -42,10 +42,6 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
XT_CONNLABEL_OP_SET;
struct xt_connlabel_mtinfo *info = par->matchinfo;
int ret;
- size_t words;
-
- if (info->bit > XT_CONNLABEL_MAXBIT)
- return -ERANGE;
if (info->options & ~options) {
pr_err("Unknown options in mask %x\n", info->options);
@@ -59,19 +55,15 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
return ret;
}
- par->net->ct.labels_used++;
- words = BITS_TO_LONGS(info->bit+1);
- if (words > par->net->ct.label_words)
- par->net->ct.label_words = words;
-
+ ret = nf_connlabels_get(par->net, info->bit + 1);
+ if (ret < 0)
+ nf_ct_l3proto_module_put(par->family);
return ret;
}
static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
{
- par->net->ct.labels_used--;
- if (par->net->ct.labels_used == 0)
- par->net->ct.label_words = 0;
+ nf_connlabels_put(par->net);
nf_ct_l3proto_module_put(par->family);
}
diff --git a/kernel/net/netfilter/xt_connlimit.c b/kernel/net/netfilter/xt_connlimit.c
index 29ba6218a..99bbc8298 100644
--- a/kernel/net/netfilter/xt_connlimit.c
+++ b/kernel/net/netfilter/xt_connlimit.c
@@ -134,7 +134,7 @@ static bool add_hlist(struct hlist_head *head,
static unsigned int check_hlist(struct net *net,
struct hlist_head *head,
const struct nf_conntrack_tuple *tuple,
- u16 zone,
+ const struct nf_conntrack_zone *zone,
bool *addit)
{
const struct nf_conntrack_tuple_hash *found;
@@ -201,7 +201,7 @@ static unsigned int
count_tree(struct net *net, struct rb_root *root,
const struct nf_conntrack_tuple *tuple,
const union nf_inet_addr *addr, const union nf_inet_addr *mask,
- u8 family, u16 zone)
+ u8 family, const struct nf_conntrack_zone *zone)
{
struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES];
struct rb_node **rbnode, *parent;
@@ -290,7 +290,8 @@ static int count_them(struct net *net,
const struct nf_conntrack_tuple *tuple,
const union nf_inet_addr *addr,
const union nf_inet_addr *mask,
- u_int8_t family, u16 zone)
+ u_int8_t family,
+ const struct nf_conntrack_zone *zone)
{
struct rb_root *root;
int count;
@@ -316,22 +317,22 @@ static int count_them(struct net *net,
static bool
connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
const struct xt_connlimit_info *info = par->matchinfo;
union nf_inet_addr addr;
struct nf_conntrack_tuple tuple;
const struct nf_conntrack_tuple *tuple_ptr = &tuple;
+ const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct;
unsigned int connections;
- u16 zone = NF_CT_DEFAULT_ZONE;
ct = nf_ct_get(skb, &ctinfo);
if (ct != NULL) {
tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
zone = nf_ct_zone(ct);
} else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- par->family, &tuple)) {
+ par->family, net, &tuple)) {
goto hotdrop;
}
diff --git a/kernel/net/netfilter/xt_ipvs.c b/kernel/net/netfilter/xt_ipvs.c
index 8d47c3780..71a9d95e0 100644
--- a/kernel/net/netfilter/xt_ipvs.c
+++ b/kernel/net/netfilter/xt_ipvs.c
@@ -48,6 +48,7 @@ static bool
ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_ipvs_mtinfo *data = par->matchinfo;
+ struct netns_ipvs *ipvs = net_ipvs(par->net);
/* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
const u_int8_t family = par->family;
struct ip_vs_iphdr iph;
@@ -67,7 +68,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
goto out;
}
- ip_vs_fill_iph_skb(family, skb, &iph);
+ ip_vs_fill_iph_skb(family, skb, true, &iph);
if (data->bitmask & XT_IPVS_PROTO)
if ((iph.protocol == data->l4proto) ^
@@ -85,7 +86,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(family, skb, &iph, 1 /* inverse */);
+ cp = pp->conn_out_get(ipvs, family, skb, &iph);
if (unlikely(cp == NULL)) {
match = false;
goto out;
diff --git a/kernel/net/netfilter/xt_mark.c b/kernel/net/netfilter/xt_mark.c
index 233452387..ebd41dc50 100644
--- a/kernel/net/netfilter/xt_mark.c
+++ b/kernel/net/netfilter/xt_mark.c
@@ -23,6 +23,7 @@ MODULE_ALIAS("ipt_mark");
MODULE_ALIAS("ip6t_mark");
MODULE_ALIAS("ipt_MARK");
MODULE_ALIAS("ip6t_MARK");
+MODULE_ALIAS("arpt_MARK");
static unsigned int
mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
diff --git a/kernel/net/netfilter/xt_nfacct.c b/kernel/net/netfilter/xt_nfacct.c
index 8c646ed9c..3048a7e3a 100644
--- a/kernel/net/netfilter/xt_nfacct.c
+++ b/kernel/net/netfilter/xt_nfacct.c
@@ -37,7 +37,7 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par)
struct xt_nfacct_match_info *info = par->matchinfo;
struct nf_acct *nfacct;
- nfacct = nfnl_acct_find_get(info->name);
+ nfacct = nfnl_acct_find_get(par->net, info->name);
if (nfacct == NULL) {
pr_info("xt_nfacct: accounting object with name `%s' "
"does not exists\n", info->name);
diff --git a/kernel/net/netfilter/xt_osf.c b/kernel/net/netfilter/xt_osf.c
index 0778855ea..df8801e02 100644
--- a/kernel/net/netfilter/xt_osf.c
+++ b/kernel/net/netfilter/xt_osf.c
@@ -200,7 +200,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
unsigned char opts[MAX_IPOPTLEN];
const struct xt_osf_finger *kf;
const struct xt_osf_user_finger *f;
- struct net *net = dev_net(p->in ? p->in : p->out);
+ struct net *net = p->net;
if (!info)
return false;
diff --git a/kernel/net/netfilter/xt_owner.c b/kernel/net/netfilter/xt_owner.c
index ca2e577ed..1302b475a 100644
--- a/kernel/net/netfilter/xt_owner.c
+++ b/kernel/net/netfilter/xt_owner.c
@@ -14,6 +14,7 @@
#include <linux/skbuff.h>
#include <linux/file.h>
#include <net/sock.h>
+#include <net/inet_sock.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_owner.h>
@@ -33,8 +34,9 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_owner_match_info *info = par->matchinfo;
const struct file *filp;
+ struct sock *sk = skb_to_full_sk(skb);
- if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+ if (sk == NULL || sk->sk_socket == NULL)
return (info->match ^ info->invert) == 0;
else if (info->match & info->invert & XT_OWNER_SOCKET)
/*
@@ -43,7 +45,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
*/
return false;
- filp = skb->sk->sk_socket->file;
+ filp = sk->sk_socket->file;
if (filp == NULL)
return ((info->match ^ info->invert) &
(XT_OWNER_UID | XT_OWNER_GID)) == 0;
diff --git a/kernel/net/netfilter/xt_recent.c b/kernel/net/netfilter/xt_recent.c
index 45e1b30e4..d725a2774 100644
--- a/kernel/net/netfilter/xt_recent.c
+++ b/kernel/net/netfilter/xt_recent.c
@@ -237,7 +237,7 @@ static void recent_table_flush(struct recent_table *t)
static bool
recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
- struct net *net = dev_net(par->in ? par->in : par->out);
+ struct net *net = par->net;
struct recent_net *recent_net = recent_pernet(net);
const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
struct recent_table *t;
diff --git a/kernel/net/netfilter/xt_set.c b/kernel/net/netfilter/xt_set.c
index 89045982e..5669e5b45 100644
--- a/kernel/net/netfilter/xt_set.c
+++ b/kernel/net/netfilter/xt_set.c
@@ -9,14 +9,16 @@
*/
/* Kernel module which implements the set match and SET target
- * for netfilter/iptables. */
+ * for netfilter/iptables.
+ */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter/xt_set.h>
+#include <linux/netfilter/ipset/ip_set.h>
#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <uapi/linux/netfilter/xt_set.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
@@ -52,6 +54,7 @@ static bool
set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v0 *info = par->matchinfo;
+
ADT_OPT(opt, par->family, info->match_set.u.compat.dim,
info->match_set.u.compat.flags, 0, UINT_MAX);
@@ -68,10 +71,10 @@ compat_flags(struct xt_set_info_v0 *info)
info->u.compat.dim = IPSET_DIM_ZERO;
if (info->u.flags[0] & IPSET_MATCH_INV)
info->u.compat.flags |= IPSET_INV_MATCH;
- for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) {
+ for (i = 0; i < IPSET_DIM_MAX - 1 && info->u.flags[i]; i++) {
info->u.compat.dim++;
if (info->u.flags[i] & IPSET_SRC)
- info->u.compat.flags |= (1<<info->u.compat.dim);
+ info->u.compat.flags |= (1 << info->u.compat.dim);
}
}
@@ -88,7 +91,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par)
info->match_set.index);
return -ENOENT;
}
- if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ if (info->match_set.u.flags[IPSET_DIM_MAX - 1] != 0) {
pr_warn("Protocol error: set match dimension is over the limit!\n");
ip_set_nfnl_put(par->net, info->match_set.index);
return -ERANGE;
@@ -114,6 +117,7 @@ static bool
set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v1 *info = par->matchinfo;
+
ADT_OPT(opt, par->family, info->match_set.dim,
info->match_set.flags, 0, UINT_MAX);
@@ -178,9 +182,10 @@ static bool
set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v3 *info = par->matchinfo;
+ int ret;
+
ADT_OPT(opt, par->family, info->match_set.dim,
info->match_set.flags, info->flags, UINT_MAX);
- int ret;
if (info->packets.op != IPSET_COUNTER_NONE ||
info->bytes.op != IPSET_COUNTER_NONE)
@@ -224,9 +229,10 @@ static bool
set_match_v4(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_set_info_match_v4 *info = par->matchinfo;
+ int ret;
+
ADT_OPT(opt, par->family, info->match_set.dim,
info->match_set.flags, info->flags, UINT_MAX);
- int ret;
if (info->packets.op != IPSET_COUNTER_NONE ||
info->bytes.op != IPSET_COUNTER_NONE)
@@ -252,6 +258,7 @@ static unsigned int
set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v0 *info = par->targinfo;
+
ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim,
info->add_set.u.compat.flags, 0, UINT_MAX);
ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim,
@@ -290,8 +297,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
return -ENOENT;
}
}
- if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 ||
- info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ if (info->add_set.u.flags[IPSET_DIM_MAX - 1] != 0 ||
+ info->del_set.u.flags[IPSET_DIM_MAX - 1] != 0) {
pr_warn("Protocol error: SET target dimension is over the limit!\n");
if (info->add_set.index != IPSET_INVALID_ID)
ip_set_nfnl_put(par->net, info->add_set.index);
@@ -324,6 +331,7 @@ static unsigned int
set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v1 *info = par->targinfo;
+
ADT_OPT(add_opt, par->family, info->add_set.dim,
info->add_set.flags, 0, UINT_MAX);
ADT_OPT(del_opt, par->family, info->del_set.dim,
@@ -392,6 +400,7 @@ static unsigned int
set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v2 *info = par->targinfo;
+
ADT_OPT(add_opt, par->family, info->add_set.dim,
info->add_set.flags, info->flags, info->timeout);
ADT_OPT(del_opt, par->family, info->del_set.dim,
@@ -399,8 +408,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
/* Normalize to fit into jiffies */
if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
- add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC)
- add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC;
+ add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC)
+ add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC;
if (info->add_set.index != IPSET_INVALID_ID)
ip_set_add(info->add_set.index, skb, par, &add_opt);
if (info->del_set.index != IPSET_INVALID_ID)
@@ -418,6 +427,8 @@ static unsigned int
set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v3 *info = par->targinfo;
+ int ret;
+
ADT_OPT(add_opt, par->family, info->add_set.dim,
info->add_set.flags, info->flags, info->timeout);
ADT_OPT(del_opt, par->family, info->del_set.dim,
@@ -425,12 +436,10 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
ADT_OPT(map_opt, par->family, info->map_set.dim,
info->map_set.flags, 0, UINT_MAX);
- int ret;
-
/* Normalize to fit into jiffies */
if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
- add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC)
- add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC;
+ add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC)
+ add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC;
if (info->add_set.index != IPSET_INVALID_ID)
ip_set_add(info->add_set.index, skb, par, &add_opt);
if (info->del_set.index != IPSET_INVALID_ID)
@@ -456,7 +465,6 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-
static int
set_target_v3_checkentry(const struct xt_tgchk_param *par)
{
@@ -496,8 +504,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
!(par->hook_mask & (1 << NF_INET_FORWARD |
1 << NF_INET_LOCAL_OUT |
1 << NF_INET_POST_ROUTING))) {
- pr_warn("mapping of prio or/and queue is allowed only"
- "from OUTPUT/FORWARD/POSTROUTING chains\n");
+ pr_warn("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
return -EINVAL;
}
index = ip_set_nfnl_get_byindex(par->net,
@@ -518,8 +525,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
if (info->add_set.dim > IPSET_DIM_MAX ||
info->del_set.dim > IPSET_DIM_MAX ||
info->map_set.dim > IPSET_DIM_MAX) {
- pr_warn("Protocol error: SET target dimension "
- "is over the limit!\n");
+ pr_warn("Protocol error: SET target dimension is over the limit!\n");
if (info->add_set.index != IPSET_INVALID_ID)
ip_set_nfnl_put(par->net, info->add_set.index);
if (info->del_set.index != IPSET_INVALID_ID)
@@ -545,7 +551,6 @@ set_target_v3_destroy(const struct xt_tgdtor_param *par)
ip_set_nfnl_put(par->net, info->map_set.index);
}
-
static struct xt_match set_matches[] __read_mostly = {
{
.name = "set",
diff --git a/kernel/net/netfilter/xt_socket.c b/kernel/net/netfilter/xt_socket.c
index e092cb046..2ec08f04b 100644
--- a/kernel/net/netfilter/xt_socket.c
+++ b/kernel/net/netfilter/xt_socket.c
@@ -143,7 +143,8 @@ static bool xt_socket_sk_is_transparent(struct sock *sk)
}
}
-static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb,
+static struct sock *xt_socket_lookup_slow_v4(struct net *net,
+ const struct sk_buff *skb,
const struct net_device *indev)
{
const struct iphdr *iph = ip_hdr(skb);
@@ -197,7 +198,7 @@ static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb,
}
#endif
- return xt_socket_get_sock_v4(dev_net(skb->dev), protocol, saddr, daddr,
+ return xt_socket_get_sock_v4(net, protocol, saddr, daddr,
sport, dport, indev);
}
@@ -205,10 +206,11 @@ static bool
socket_match(const struct sk_buff *skb, struct xt_action_param *par,
const struct xt_socket_mtinfo1 *info)
{
+ struct sk_buff *pskb = (struct sk_buff *)skb;
struct sock *sk = skb->sk;
if (!sk)
- sk = xt_socket_lookup_slow_v4(skb, par->in);
+ sk = xt_socket_lookup_slow_v4(par->net, skb, par->in);
if (sk) {
bool wildcard;
bool transparent = true;
@@ -226,6 +228,10 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
if (info->flags & XT_SOCKET_TRANSPARENT)
transparent = xt_socket_sk_is_transparent(sk);
+ if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
+ transparent)
+ pskb->mark = sk->sk_mark;
+
if (sk != skb->sk)
sock_gen_put(sk);
@@ -247,7 +253,7 @@ socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
}
static bool
-socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
{
return socket_match(skb, par, par->matchinfo);
}
@@ -330,7 +336,8 @@ xt_socket_get_sock_v6(struct net *net, const u8 protocol,
return NULL;
}
-static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb,
+static struct sock *xt_socket_lookup_slow_v6(struct net *net,
+ const struct sk_buff *skb,
const struct net_device *indev)
{
__be16 uninitialized_var(dport), uninitialized_var(sport);
@@ -366,18 +373,19 @@ static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb,
return NULL;
}
- return xt_socket_get_sock_v6(dev_net(skb->dev), tproto, saddr, daddr,
+ return xt_socket_get_sock_v6(net, tproto, saddr, daddr,
sport, dport, indev);
}
static bool
-socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+ struct sk_buff *pskb = (struct sk_buff *)skb;
struct sock *sk = skb->sk;
if (!sk)
- sk = xt_socket_lookup_slow_v6(skb, par->in);
+ sk = xt_socket_lookup_slow_v6(par->net, skb, par->in);
if (sk) {
bool wildcard;
bool transparent = true;
@@ -395,6 +403,10 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
if (info->flags & XT_SOCKET_TRANSPARENT)
transparent = xt_socket_sk_is_transparent(sk);
+ if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
+ transparent)
+ pskb->mark = sk->sk_mark;
+
if (sk != skb->sk)
sock_gen_put(sk);
@@ -428,6 +440,19 @@ static int socket_mt_v2_check(const struct xt_mtchk_param *par)
return 0;
}
+static int socket_mt_v3_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_socket_mtinfo3 *info =
+ (struct xt_socket_mtinfo3 *)par->matchinfo;
+
+ if (info->flags & ~XT_SOCKET_FLAGS_V3) {
+ pr_info("unknown flags 0x%x\n",
+ info->flags & ~XT_SOCKET_FLAGS_V3);
+ return -EINVAL;
+ }
+ return 0;
+}
+
static struct xt_match socket_mt_reg[] __read_mostly = {
{
.name = "socket",
@@ -442,7 +467,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 1,
.family = NFPROTO_IPV4,
- .match = socket_mt4_v1_v2,
+ .match = socket_mt4_v1_v2_v3,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -454,7 +479,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 1,
.family = NFPROTO_IPV6,
- .match = socket_mt6_v1_v2,
+ .match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -466,7 +491,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 2,
.family = NFPROTO_IPV4,
- .match = socket_mt4_v1_v2,
+ .match = socket_mt4_v1_v2_v3,
.checkentry = socket_mt_v2_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -478,7 +503,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 2,
.family = NFPROTO_IPV6,
- .match = socket_mt6_v1_v2,
+ .match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v2_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -486,6 +511,30 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.me = THIS_MODULE,
},
#endif
+ {
+ .name = "socket",
+ .revision = 3,
+ .family = NFPROTO_IPV4,
+ .match = socket_mt4_v1_v2_v3,
+ .checkentry = socket_mt_v3_check,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#ifdef XT_SOCKET_HAVE_IPV6
+ {
+ .name = "socket",
+ .revision = 3,
+ .family = NFPROTO_IPV6,
+ .match = socket_mt6_v1_v2_v3,
+ .checkentry = socket_mt_v3_check,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init socket_mt_init(void)