From e09b41010ba33a20a87472ee821fa407a5b8da36 Mon Sep 17 00:00:00 2001 From: José Pekkarinen Date: Mon, 11 Apr 2016 10:41:07 +0300 Subject: These changes are the raw update to linux-4.4.6-rt14. Kernel sources are taken from kernel.org, and rt patch from the rt wiki download page. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During the rebasing, the following patch collided: Force tick interrupt and get rid of softirq magic(I70131fb85). Collisions have been removed because its logic was found on the source already. Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769 Signed-off-by: José Pekkarinen --- kernel/net/netfilter/Kconfig | 52 +- kernel/net/netfilter/Makefile | 3 +- kernel/net/netfilter/core.c | 233 +++- kernel/net/netfilter/ipset/ip_set_bitmap_gen.h | 61 +- kernel/net/netfilter/ipset/ip_set_bitmap_ip.c | 58 +- kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c | 119 +- kernel/net/netfilter/ipset/ip_set_bitmap_port.c | 45 +- kernel/net/netfilter/ipset/ip_set_core.c | 404 +++--- kernel/net/netfilter/ipset/ip_set_getport.c | 19 +- kernel/net/netfilter/ipset/ip_set_hash_gen.h | 756 ++++++---- kernel/net/netfilter/ipset/ip_set_hash_ip.c | 72 +- kernel/net/netfilter/ipset/ip_set_hash_ipmark.c | 87 +- kernel/net/netfilter/ipset/ip_set_hash_ipport.c | 98 +- kernel/net/netfilter/ipset/ip_set_hash_ipportip.c | 91 +- kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c | 96 +- kernel/net/netfilter/ipset/ip_set_hash_mac.c | 30 +- kernel/net/netfilter/ipset/ip_set_hash_net.c | 73 +- kernel/net/netfilter/ipset/ip_set_hash_netiface.c | 250 +--- kernel/net/netfilter/ipset/ip_set_hash_netnet.c | 158 ++- kernel/net/netfilter/ipset/ip_set_hash_netport.c | 86 +- .../net/netfilter/ipset/ip_set_hash_netportnet.c | 188 +-- kernel/net/netfilter/ipset/ip_set_list_set.c | 427 +++--- kernel/net/netfilter/ipset/pfxlen.c | 16 +- kernel/net/netfilter/ipvs/Kconfig | 11 + kernel/net/netfilter/ipvs/Makefile | 1 + kernel/net/netfilter/ipvs/ip_vs_app.c | 36 +- kernel/net/netfilter/ipvs/ip_vs_conn.c | 91 +- kernel/net/netfilter/ipvs/ip_vs_core.c | 566 ++++---- kernel/net/netfilter/ipvs/ip_vs_ctl.c | 502 ++++--- kernel/net/netfilter/ipvs/ip_vs_est.c | 20 +- kernel/net/netfilter/ipvs/ip_vs_ftp.c | 27 +- kernel/net/netfilter/ipvs/ip_vs_lblc.c | 3 +- kernel/net/netfilter/ipvs/ip_vs_lblcr.c | 3 +- kernel/net/netfilter/ipvs/ip_vs_nfct.c | 5 +- kernel/net/netfilter/ipvs/ip_vs_ovf.c | 86 ++ kernel/net/netfilter/ipvs/ip_vs_pe_sip.c | 2 +- kernel/net/netfilter/ipvs/ip_vs_proto.c | 33 +- kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 32 +- kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c | 58 +- kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c | 61 +- kernel/net/netfilter/ipvs/ip_vs_proto_udp.c | 49 +- kernel/net/netfilter/ipvs/ip_vs_sched.c | 14 +- kernel/net/netfilter/ipvs/ip_vs_sh.c | 45 +- kernel/net/netfilter/ipvs/ip_vs_sync.c | 374 ++--- kernel/net/netfilter/ipvs/ip_vs_xmit.c | 145 +- kernel/net/netfilter/nf_conntrack_core.c | 197 +-- kernel/net/netfilter/nf_conntrack_expect.c | 22 +- kernel/net/netfilter/nf_conntrack_h323_main.c | 4 +- kernel/net/netfilter/nf_conntrack_labels.c | 34 +- kernel/net/netfilter/nf_conntrack_netlink.c | 331 +++-- kernel/net/netfilter/nf_conntrack_pptp.c | 3 +- kernel/net/netfilter/nf_conntrack_proto_dccp.c | 2 +- kernel/net/netfilter/nf_conntrack_proto_generic.c | 10 +- kernel/net/netfilter/nf_conntrack_proto_gre.c | 3 +- kernel/net/netfilter/nf_conntrack_proto_sctp.c | 103 +- kernel/net/netfilter/nf_conntrack_proto_tcp.c | 2 +- kernel/net/netfilter/nf_conntrack_proto_udp.c | 1 + kernel/net/netfilter/nf_conntrack_proto_udplite.c | 1 + kernel/net/netfilter/nf_conntrack_seqadj.c | 9 +- kernel/net/netfilter/nf_conntrack_standalone.c | 39 +- kernel/net/netfilter/nf_internals.h | 1 + kernel/net/netfilter/nf_log.c | 9 +- kernel/net/netfilter/nf_nat_core.c | 28 +- kernel/net/netfilter/nf_nat_proto_dccp.c | 2 +- kernel/net/netfilter/nf_nat_proto_tcp.c | 2 +- kernel/net/netfilter/nf_nat_proto_udp.c | 2 +- kernel/net/netfilter/nf_nat_proto_udplite.c | 2 +- kernel/net/netfilter/nf_nat_redirect.c | 2 +- kernel/net/netfilter/nf_queue.c | 55 +- kernel/net/netfilter/nf_synproxy_core.c | 24 +- kernel/net/netfilter/nf_tables_api.c | 221 +-- kernel/net/netfilter/nf_tables_core.c | 10 +- kernel/net/netfilter/nf_tables_netdev.c | 256 ++++ kernel/net/netfilter/nfnetlink.c | 54 +- kernel/net/netfilter/nfnetlink_acct.c | 71 +- kernel/net/netfilter/nfnetlink_cttimeout.c | 34 + kernel/net/netfilter/nfnetlink_log.c | 91 +- kernel/net/netfilter/nfnetlink_queue.c | 1444 ++++++++++++++++++++ kernel/net/netfilter/nfnetlink_queue_core.c | 1362 ------------------ kernel/net/netfilter/nfnetlink_queue_ct.c | 113 -- kernel/net/netfilter/nft_compat.c | 26 +- kernel/net/netfilter/nft_counter.c | 124 +- kernel/net/netfilter/nft_ct.c | 1 + kernel/net/netfilter/nft_dynset.c | 5 +- kernel/net/netfilter/nft_limit.c | 188 ++- kernel/net/netfilter/nft_log.c | 3 +- kernel/net/netfilter/nft_meta.c | 44 +- kernel/net/netfilter/nft_payload.c | 57 +- kernel/net/netfilter/nft_queue.c | 2 +- kernel/net/netfilter/nft_reject_inet.c | 19 +- kernel/net/netfilter/x_tables.c | 85 +- kernel/net/netfilter/xt_CT.c | 46 +- kernel/net/netfilter/xt_IDLETIMER.c | 1 + kernel/net/netfilter/xt_LOG.c | 2 +- kernel/net/netfilter/xt_NFLOG.c | 2 +- kernel/net/netfilter/xt_TCPMSS.c | 16 +- kernel/net/netfilter/xt_TCPOPTSTRIP.c | 2 +- kernel/net/netfilter/xt_TEE.c | 168 +-- kernel/net/netfilter/xt_TPROXY.c | 30 +- kernel/net/netfilter/xt_addrtype.c | 6 +- kernel/net/netfilter/xt_connlabel.c | 16 +- kernel/net/netfilter/xt_connlimit.c | 13 +- kernel/net/netfilter/xt_ipvs.c | 5 +- kernel/net/netfilter/xt_mark.c | 1 + kernel/net/netfilter/xt_nfacct.c | 2 +- kernel/net/netfilter/xt_osf.c | 2 +- kernel/net/netfilter/xt_owner.c | 6 +- kernel/net/netfilter/xt_recent.c | 2 +- kernel/net/netfilter/xt_set.c | 47 +- kernel/net/netfilter/xt_socket.c | 73 +- 110 files changed, 6306 insertions(+), 4818 deletions(-) create mode 100644 kernel/net/netfilter/ipvs/ip_vs_ovf.c create mode 100644 kernel/net/netfilter/nf_tables_netdev.c create mode 100644 kernel/net/netfilter/nfnetlink_queue.c delete mode 100644 kernel/net/netfilter/nfnetlink_queue_core.c delete mode 100644 kernel/net/netfilter/nfnetlink_queue_ct.c (limited to 'kernel/net/netfilter') diff --git a/kernel/net/netfilter/Kconfig b/kernel/net/netfilter/Kconfig index a0f3e6a3c..4692782b5 100644 --- a/kernel/net/netfilter/Kconfig +++ b/kernel/net/netfilter/Kconfig @@ -1,6 +1,14 @@ menu "Core Netfilter Configuration" depends on NET && INET && NETFILTER +config NETFILTER_INGRESS + bool "Netfilter ingress support" + default y + select NET_INGRESS + help + This allows you to classify packets from ingress using the Netfilter + infrastructure. + config NETFILTER_NETLINK tristate @@ -198,7 +206,7 @@ config NF_CONNTRACK_FTP config NF_CONNTRACK_H323 tristate "H.323 protocol support" - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on NETFILTER_ADVANCED help H.323 is a VoIP signalling protocol from ITU-T. As one of the most @@ -346,7 +354,7 @@ config NF_CT_NETLINK_HELPER select NETFILTER_NETLINK depends on NF_CT_NETLINK depends on NETFILTER_NETLINK_QUEUE - depends on NETFILTER_NETLINK_QUEUE_CT + depends on NETFILTER_NETLINK_GLUE_CT depends on NETFILTER_ADVANCED help This option enables the user-space connection tracking helpers @@ -354,13 +362,14 @@ config NF_CT_NETLINK_HELPER If unsure, say `N'. -config NETFILTER_NETLINK_QUEUE_CT - bool "NFQUEUE integration with Connection Tracking" - default n - depends on NETFILTER_NETLINK_QUEUE +config NETFILTER_NETLINK_GLUE_CT + bool "NFQUEUE and NFLOG integration with Connection Tracking" + default n + depends on (NETFILTER_NETLINK_QUEUE || NETFILTER_NETLINK_LOG) && NF_CT_NETLINK help - If this option is enabled, NFQUEUE can include Connection Tracking - information together with the packet is the enqueued via NFNETLINK. + If this option is enabled, NFQUEUE and NFLOG can include + Connection Tracking information together with the packet is + the enqueued via NFNETLINK. config NF_NAT tristate @@ -448,6 +457,11 @@ config NF_TABLES_INET help This option enables support for a mixed IPv4/IPv6 "inet" table. +config NF_TABLES_NETDEV + tristate "Netfilter nf_tables netdev tables support" + help + This option enables support for the "netdev" table. + config NFT_EXTHDR tristate "Netfilter nf_tables IPv6 exthdr module" help @@ -710,7 +724,7 @@ config NETFILTER_XT_TARGET_HL config NETFILTER_XT_TARGET_HMARK tristate '"HMARK" target support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED ---help--- This option adds the "HMARK" target. @@ -852,8 +866,10 @@ config NETFILTER_XT_TARGET_REDIRECT config NETFILTER_XT_TARGET_TEE tristate '"TEE" - packet cloning to alternate destination' depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK + select NF_DUP_IPV4 + select NF_DUP_IPV6 if IP6_NF_IPTABLES != n ---help--- This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. @@ -862,11 +878,11 @@ config NETFILTER_XT_TARGET_TPROXY tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IPV6 || IPV6=n + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 - select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n help This option adds a `TPROXY' target, which is somewhat similar to REDIRECT. It can only be used in the mangle table and is useful @@ -902,7 +918,7 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n default m if NETFILTER_ADVANCED=n ---help--- This option adds a `TCPMSS' target, which allows you to alter the @@ -1114,7 +1130,7 @@ config NETFILTER_XT_MATCH_ESP config NETFILTER_XT_MATCH_HASHLIMIT tristate '"hashlimit" match support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED help This option adds a `hashlimit' match. @@ -1356,10 +1372,10 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK - depends on (IPV6 || IPV6=n) - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IPV6 || IPV6=n + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n select NF_DEFRAG_IPV4 - select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n help This option adds a `socket' match, which can be used to match packets for which a TCP or UDP socket lookup finds a valid socket. diff --git a/kernel/net/netfilter/Makefile b/kernel/net/netfilter/Makefile index a87d8b8ec..7638c36b4 100644 --- a/kernel/net/netfilter/Makefile +++ b/kernel/net/netfilter/Makefile @@ -10,8 +10,6 @@ obj-$(CONFIG_NETFILTER) = netfilter.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o -nfnetlink_queue-y := nfnetlink_queue_core.o -nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o @@ -75,6 +73,7 @@ nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o +obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o obj-$(CONFIG_NFT_META) += nft_meta.o diff --git a/kernel/net/netfilter/core.c b/kernel/net/netfilter/core.c index f0adf700b..10880c89d 100644 --- a/kernel/net/netfilter/core.c +++ b/kernel/net/netfilter/core.c @@ -40,6 +40,9 @@ EXPORT_SYMBOL(nf_afinfo); const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); +DEFINE_PER_CPU(bool, nf_skb_duplicated); +EXPORT_SYMBOL_GPL(nf_skb_duplicated); + int nf_register_afinfo(const struct nf_afinfo *afinfo) { mutex_lock(&afinfo_mutex); @@ -58,9 +61,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo) } EXPORT_SYMBOL_GPL(nf_unregister_afinfo); -struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; -EXPORT_SYMBOL(nf_hooks); - #ifdef HAVE_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); @@ -68,33 +68,168 @@ EXPORT_SYMBOL(nf_hooks_needed); static DEFINE_MUTEX(nf_hook_mutex); -int nf_register_hook(struct nf_hook_ops *reg) +static struct list_head *nf_find_hook_list(struct net *net, + const struct nf_hook_ops *reg) +{ + struct list_head *hook_list = NULL; + + if (reg->pf != NFPROTO_NETDEV) + hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; + else if (reg->hooknum == NF_NETDEV_INGRESS) { +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->dev && dev_net(reg->dev) == net) + hook_list = ®->dev->nf_hooks_ingress; +#endif + } + return hook_list; +} + +struct nf_hook_entry { + const struct nf_hook_ops *orig_ops; + struct nf_hook_ops ops; +}; + +int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { + struct list_head *hook_list; + struct nf_hook_entry *entry; struct nf_hook_ops *elem; + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->orig_ops = reg; + entry->ops = *reg; + + hook_list = nf_find_hook_list(net, reg); + if (!hook_list) { + kfree(entry); + return -ENOENT; + } + mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { + list_for_each_entry(elem, hook_list, list) { if (reg->priority < elem->priority) break; } - list_add_rcu(®->list, elem->list.prev); + list_add_rcu(&entry->ops.list, elem->list.prev); mutex_unlock(&nf_hook_mutex); +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + net_inc_ingress_queue(); +#endif #ifdef HAVE_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; } -EXPORT_SYMBOL(nf_register_hook); +EXPORT_SYMBOL(nf_register_net_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { + struct list_head *hook_list; + struct nf_hook_entry *entry; + struct nf_hook_ops *elem; + + hook_list = nf_find_hook_list(net, reg); + if (!hook_list) + return; + mutex_lock(&nf_hook_mutex); - list_del_rcu(®->list); + list_for_each_entry(elem, hook_list, list) { + entry = container_of(elem, struct nf_hook_entry, ops); + if (entry->orig_ops == reg) { + list_del_rcu(&entry->ops.list); + break; + } + } mutex_unlock(&nf_hook_mutex); + if (&elem->list == hook_list) { + WARN(1, "nf_unregister_net_hook: hook not found!\n"); + return; + } +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + net_dec_ingress_queue(); +#endif #ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); + nf_queue_nf_hook_drop(net, &entry->ops); + /* other cpu might still process nfqueue verdict that used reg */ + synchronize_net(); + kfree(entry); +} +EXPORT_SYMBOL(nf_unregister_net_hook); + +int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = nf_register_net_hook(net, ®[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + nf_unregister_net_hooks(net, reg, i); + return err; +} +EXPORT_SYMBOL(nf_register_net_hooks); + +void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n) +{ + while (n-- > 0) + nf_unregister_net_hook(net, ®[n]); +} +EXPORT_SYMBOL(nf_unregister_net_hooks); + +static LIST_HEAD(nf_hook_list); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct net *net, *last; + int ret; + + rtnl_lock(); + for_each_net(net) { + ret = nf_register_net_hook(net, reg); + if (ret && ret != -ENOENT) + goto rollback; + } + list_add_tail(®->list, &nf_hook_list); + rtnl_unlock(); + + return 0; +rollback: + last = net; + for_each_net(net) { + if (net == last) + break; + nf_unregister_net_hook(net, reg); + } + rtnl_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_register_hook); + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + struct net *net; + + rtnl_lock(); + list_del(®->list); + for_each_net(net) + nf_unregister_net_hook(net, reg); + rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -142,7 +277,7 @@ unsigned int nf_iterate(struct list_head *head, /* Optimization: we don't need to hold module reference here, since function can't sleep. --RR */ repeat: - verdict = (*elemp)->hook(*elemp, skb, state); + verdict = (*elemp)->hook((*elemp)->priv, skb, state); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) @@ -172,11 +307,9 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - elem = list_entry_rcu(&nf_hooks[state->pf][state->hook], - struct nf_hook_ops, list); + elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); next_hook: - verdict = nf_iterate(&nf_hooks[state->pf][state->hook], skb, state, - &elem); + verdict = nf_iterate(state->hook_list, skb, state, &elem); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { @@ -188,8 +321,6 @@ next_hook: int err = nf_queue(skb, elem, state, verdict >> NF_VERDICT_QBITS); if (err < 0) { - if (err == -ECANCELED) - goto next_hook; if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) goto next_hook; @@ -223,6 +354,12 @@ int skb_make_writable(struct sk_buff *skb, unsigned int writable_len) } EXPORT_SYMBOL(skb_make_writable); +/* This needs to be compiled in any case to avoid dependencies between the + * nfnetlink_queue code and nf_conntrack. + */ +struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; +EXPORT_SYMBOL_GPL(nfnl_ct_hook); + #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence @@ -260,12 +397,12 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_conntrack_destroy); -struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly; -EXPORT_SYMBOL_GPL(nfq_ct_hook); - -struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly; -EXPORT_SYMBOL_GPL(nfq_ct_nat_hook); - +/* Built-in default zone used e.g. by modules. */ +const struct nf_conntrack_zone nf_ct_zone_dflt = { + .id = NF_CT_DEFAULT_ZONE_ID, + .dir = NF_CT_DEFAULT_ZONE_DIR, +}; +EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); #endif /* CONFIG_NF_CONNTRACK */ #ifdef CONFIG_NF_NAT_NEEDED @@ -273,8 +410,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif +static int nf_register_hook_list(struct net *net) +{ + struct nf_hook_ops *elem; + int ret; + + rtnl_lock(); + list_for_each_entry(elem, &nf_hook_list, list) { + ret = nf_register_net_hook(net, elem); + if (ret && ret != -ENOENT) + goto out_undo; + } + rtnl_unlock(); + return 0; + +out_undo: + list_for_each_entry_continue_reverse(elem, &nf_hook_list, list) + nf_unregister_net_hook(net, elem); + rtnl_unlock(); + return ret; +} + +static void nf_unregister_hook_list(struct net *net) +{ + struct nf_hook_ops *elem; + + rtnl_lock(); + list_for_each_entry(elem, &nf_hook_list, list) + nf_unregister_net_hook(net, elem); + rtnl_unlock(); +} + static int __net_init netfilter_net_init(struct net *net) { + int i, h, ret; + + for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&net->nf.hooks[i][h]); + } + #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); @@ -285,11 +460,16 @@ static int __net_init netfilter_net_init(struct net *net) return -ENOMEM; } #endif - return 0; + ret = nf_register_hook_list(net); + if (ret) + remove_proc_entry("netfilter", net->proc_net); + + return ret; } static void __net_exit netfilter_net_exit(struct net *net) { + nf_unregister_hook_list(net); remove_proc_entry("netfilter", net->proc_net); } @@ -300,12 +480,7 @@ static struct pernet_operations netfilter_net_ops = { int __init netfilter_init(void) { - int i, h, ret; - - for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { - for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&nf_hooks[i][h]); - } + int ret; ret = register_pernet_subsys(&netfilter_net_ops); if (ret < 0) diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h b/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h index 6f024a8a1..b0bc475f6 100644 --- a/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/kernel/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -33,7 +33,7 @@ #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype MTYPE -#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id)) +#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) @@ -41,7 +41,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) struct mtype *map = set->data; init_timer(&map->gc); - map->gc.data = (unsigned long) set; + map->gc.data = (unsigned long)set; map->gc.function = gc; map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -67,12 +67,9 @@ mtype_destroy(struct ip_set *set) del_timer_sync(&map->gc); ip_set_free(map->members); - if (set->dsize) { - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set); - ip_set_free(map->extensions); - } - kfree(map); + if (set->dsize && set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set); + ip_set_free(map); set->data = NULL; } @@ -92,16 +89,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) { const struct mtype *map = set->data; struct nlattr *nested; + size_t memsize = sizeof(*map) + map->memsize; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || - nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + - map->memsize + - set->dsize * map->elements))) + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; @@ -144,10 +139,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (ret == IPSET_ADD_FAILED) { if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(x, set))) + ip_set_timeout_expired(ext_timeout(x, set))) { ret = 0; - else if (!(flags & IPSET_FLAG_EXIST)) + } else if (!(flags & IPSET_FLAG_EXIST)) { + set_bit(e->id, map->members); return -IPSET_ERR_EXIST; + } /* Element is re-added, cleanup extensions */ ip_set_ext_destroy(set, x); } @@ -165,6 +162,10 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_init_comment(ext_comment(x, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(x, set), ext); + + /* Activate element */ + set_bit(e->id, map->members); + return 0; } @@ -203,10 +204,13 @@ mtype_list(const struct ip_set *set, struct nlattr *adt, *nested; void *x; u32 id, first = cb->args[IPSET_CB_ARG0]; + int ret = 0; adt = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!adt) return -EMSGSIZE; + /* Extensions may be replaced */ + rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < map->elements; cb->args[IPSET_CB_ARG0]++) { id = cb->args[IPSET_CB_ARG0]; @@ -214,7 +218,7 @@ mtype_list(const struct ip_set *set, if (!test_bit(id, map->members) || (SET_WITH_TIMEOUT(set) && #ifdef IP_SET_BITMAP_STORED_TIMEOUT - mtype_is_filled((const struct mtype_elem *) x) && + mtype_is_filled((const struct mtype_elem *)x) && #endif ip_set_timeout_expired(ext_timeout(x, set)))) continue; @@ -222,14 +226,16 @@ mtype_list(const struct ip_set *set, if (!nested) { if (id == first) { nla_nest_cancel(skb, adt); - return -EMSGSIZE; - } else - goto nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + + goto nla_put_failure; } if (mtype_do_list(skb, map, id, set->dsize)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, x, - mtype_is_filled((const struct mtype_elem *) x))) + mtype_is_filled((const struct mtype_elem *)x))) goto nla_put_failure; ipset_nest_end(skb, nested); } @@ -238,29 +244,32 @@ mtype_list(const struct ip_set *set, /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(id == first)) { cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; } ipset_nest_end(skb, adt); - return 0; +out: + rcu_read_unlock(); + return ret; } static void mtype_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct mtype *map = set->data; void *x; u32 id; /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); + * but adding/deleting new entries is locked out + */ + spin_lock_bh(&set->lock); for (id = 0; id < map->elements; id++) if (mtype_gc_test(id, map, set->dsize)) { x = get_ext(set, map, id); @@ -269,7 +278,7 @@ mtype_gc(unsigned long ul_set) ip_set_ext_destroy(set, x); } } - read_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c b/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c index 55b083ec5..4783efff0 100644 --- a/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/kernel/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -36,11 +36,11 @@ IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); #define MTYPE bitmap_ip +#define HOST_MASK 32 /* Type structure */ struct bitmap_ip { void *members; /* the set members */ - void *extensions; /* data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -48,6 +48,8 @@ struct bitmap_ip { size_t memsize; /* members size */ u8 netmask; /* subnet netmask */ struct timer_list gc; /* garbage collection */ + unsigned char extensions[0] /* data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ @@ -58,7 +60,7 @@ struct bitmap_ip_adt_elem { static inline u32 ip_to_id(const struct bitmap_ip *m, u32 ip) { - return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; + return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts; } /* Common functions */ @@ -80,7 +82,7 @@ static inline int bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, u32 flags, size_t dsize) { - return !!test_and_set_bit(e->id, map->members); + return !!test_bit(e->id, map->members); } static inline int @@ -137,20 +139,17 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -174,11 +173,12 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); - } else + } else { ip_to = ip; + } if (ip_to > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; @@ -189,8 +189,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -225,13 +225,6 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; @@ -277,16 +270,17 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr >= 32) + if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); - } else + } else { return -IPSET_ERR_PROTOCOL; + } if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); - if (netmask > 32) + if (netmask > HOST_MASK) return -IPSET_ERR_INVALID_NETMASK; first_ip &= ip_set_hostmask(netmask); @@ -316,13 +310,13 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], pr_debug("hosts %u, elements %llu\n", hosts, (unsigned long long)elements); - map = kzalloc(sizeof(*map), GFP_KERNEL); + set->dsize = ip_set_elem_len(set, tb, 0, 0); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->memsize = bitmap_bytes(0, elements - 1); set->variant = &bitmap_ip; - set->dsize = ip_set_elem_len(set, tb, 0); if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { kfree(map); @@ -360,7 +354,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -377,6 +372,7 @@ bitmap_ip_init(void) static void __exit bitmap_ip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_ip_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 86104744b..29dde2083 100644 --- a/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/kernel/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -36,6 +36,7 @@ IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); #define MTYPE bitmap_ipmac +#define HOST_MASK 32 #define IP_SET_BITMAP_STORED_TIMEOUT enum { @@ -46,24 +47,26 @@ enum { /* Type structure */ struct bitmap_ipmac { void *members; /* the set members */ - void *extensions; /* MAC + data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ + unsigned char extensions[0] /* MAC + data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ struct bitmap_ipmac_adt_elem { + unsigned char ether[ETH_ALEN] __aligned(2); u16 id; - unsigned char *ether; + u16 add_mac; }; struct bitmap_ipmac_elem { unsigned char ether[ETH_ALEN]; unsigned char filled; -} __attribute__ ((aligned)); +} __aligned(__alignof__(u64)); static inline u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) @@ -71,11 +74,11 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip) return ip - m->first_ip; } -static inline struct bitmap_ipmac_elem * -get_elem(void *extensions, u16 id, size_t dsize) -{ - return (struct bitmap_ipmac_elem *)(extensions + id * dsize); -} +#define get_elem(extensions, id, dsize) \ + (struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) + +#define get_const_elem(extensions, id, dsize) \ + (const struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) /* Common functions */ @@ -87,10 +90,9 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, if (!test_bit(e->id, map->members)) return 0; - elem = get_elem(map->extensions, e->id, dsize); - if (elem->filled == MAC_FILLED) - return e->ether == NULL || - ether_addr_equal(e->ether, elem->ether); + elem = get_const_elem(map->extensions, e->id, dsize); + if (e->add_mac && elem->filled == MAC_FILLED) + return ether_addr_equal(e->ether, elem->ether); /* Trigger kernel to fill out the ethernet address */ return -EAGAIN; } @@ -102,7 +104,7 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) if (!test_bit(id, map->members)) return 0; - elem = get_elem(map->extensions, id, dsize); + elem = get_const_elem(map->extensions, id, dsize); /* Timer not started for the incomplete elements */ return elem->filled == MAC_FILLED; } @@ -130,8 +132,9 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, /* If MAC is unset yet, we store plain timeout value * because the timer is not activated yet * and we can reuse it later when MAC is filled out, - * possibly by the kernel */ - if (e->ether) + * possibly by the kernel + */ + if (e->add_mac) ip_set_timeout_set(timeout, t); else *timeout = t; @@ -146,28 +149,35 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac_elem *elem; elem = get_elem(map->extensions, e->id, dsize); - if (test_and_set_bit(e->id, map->members)) { + if (test_bit(e->id, map->members)) { if (elem->filled == MAC_FILLED) { - if (e->ether && (flags & IPSET_FLAG_EXIST)) - memcpy(elem->ether, e->ether, ETH_ALEN); + if (e->add_mac && + (flags & IPSET_FLAG_EXIST) && + !ether_addr_equal(e->ether, elem->ether)) { + /* memcpy isn't atomic */ + clear_bit(e->id, map->members); + smp_mb__after_atomic(); + ether_addr_copy(elem->ether, e->ether); + } return IPSET_ADD_FAILED; - } else if (!e->ether) + } else if (!e->add_mac) /* Already added without ethernet address */ return IPSET_ADD_FAILED; /* Fill the MAC address and trigger the timer activation */ - memcpy(elem->ether, e->ether, ETH_ALEN); + clear_bit(e->id, map->members); + smp_mb__after_atomic(); + ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return IPSET_ADD_START_STORED_TIMEOUT; - } else if (e->ether) { + } else if (e->add_mac) { /* We can store MAC too */ - memcpy(elem->ether, e->ether, ETH_ALEN); + ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return 0; - } else { - elem->filled = MAC_UNSET; - /* MAC is not stored yet, don't start timer */ - return IPSET_ADD_STORE_PLAIN_TIMEOUT; } + elem->filled = MAC_UNSET; + /* MAC is not stored yet, don't start timer */ + return IPSET_ADD_STORE_PLAIN_TIMEOUT; } static inline int @@ -182,7 +192,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { const struct bitmap_ipmac_elem *elem = - get_elem(map->extensions, id, dsize); + get_const_elem(map->extensions, id, dsize); return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip + id)) || @@ -204,7 +214,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_ipmac_adt_elem e = { .id = 0 }; + struct bitmap_ipmac_adt_elem e = { .id = 0, .add_mac = 1 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; @@ -222,7 +232,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, return -EINVAL; e.id = ip_to_id(map, ip); - e.ether = eth_hdr(skb)->h_source; + memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -238,20 +248,17 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -259,11 +266,10 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; e.id = ip_to_id(map, ip); - if (tb[IPSET_ATTR_ETHER]) - e.ether = nla_data(tb[IPSET_ATTR_ETHER]); - else - e.ether = NULL; - + if (tb[IPSET_ATTR_ETHER]) { + memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + e.add_mac = 1; + } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; @@ -294,13 +300,6 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; @@ -343,25 +342,27 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr >= 32) + if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); - } else + } else { return -IPSET_ERR_PROTOCOL; + } elements = (u64)last_ip - first_ip + 1; if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; - map = kzalloc(sizeof(*map), GFP_KERNEL); + set->dsize = ip_set_elem_len(set, tb, + sizeof(struct bitmap_ipmac_elem), + __alignof__(struct bitmap_ipmac_elem)); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->memsize = bitmap_bytes(0, elements - 1); set->variant = &bitmap_ipmac; - set->dsize = ip_set_elem_len(set, tb, - sizeof(struct bitmap_ipmac_elem)); if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); return -ENOMEM; @@ -397,7 +398,8 @@ static struct ip_set_type bitmap_ipmac_type = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -414,6 +416,7 @@ bitmap_ipmac_init(void) static void __exit bitmap_ipmac_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_ipmac_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_bitmap_port.c b/kernel/net/netfilter/ipset/ip_set_bitmap_port.c index 005dd3644..7f0c73335 100644 --- a/kernel/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/kernel/net/netfilter/ipset/ip_set_bitmap_port.c @@ -35,12 +35,13 @@ MODULE_ALIAS("ip_set_bitmap:port"); /* Type structure */ struct bitmap_port { void *members; /* the set members */ - void *extensions; /* data extensions */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ + unsigned char extensions[0] /* data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ @@ -73,7 +74,7 @@ static inline int bitmap_port_do_add(const struct bitmap_port_adt_elem *e, struct bitmap_port *map, u32 flags, size_t dsize) { - return !!test_and_set_bit(e->id, map->members); + return !!test_bit(e->id, map->members); } static inline int @@ -136,19 +137,13 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], u16 port_to; int ret = 0; - if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) + return -IPSET_ERR_PROTOCOL; + port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; @@ -168,8 +163,9 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], if (port < map->first_port) return -IPSET_ERR_BITMAP_RANGE; } - } else + } else { port_to = port; + } if (port_to > map->last_port) return -IPSET_ERR_BITMAP_RANGE; @@ -180,8 +176,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -214,13 +210,6 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * map->elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_port = first_port; map->last_port = last_port; set->timeout = IPSET_NO_TIMEOUT; @@ -237,6 +226,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], { struct bitmap_port *map; u16 first_port, last_port; + u32 elements; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -253,14 +243,15 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], last_port = tmp; } - map = kzalloc(sizeof(*map), GFP_KERNEL); + elements = last_port - first_port + 1; + set->dsize = ip_set_elem_len(set, tb, 0, 0); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; - map->elements = last_port - first_port + 1; + map->elements = elements; map->memsize = bitmap_bytes(0, map->elements); set->variant = &bitmap_port; - set->dsize = ip_set_elem_len(set, tb, 0); if (!init_map_port(set, map, first_port, last_port)) { kfree(map); return -ENOMEM; @@ -294,7 +285,8 @@ static struct ip_set_type bitmap_port_type = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -311,6 +303,7 @@ bitmap_port_init(void) static void __exit bitmap_port_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_port_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_core.c b/kernel/net/netfilter/ipset/ip_set_core.c index d259da3ce..54f3d7cb2 100644 --- a/kernel/net/netfilter/ipset/ip_set_core.c +++ b/kernel/net/netfilter/ipset/ip_set_core.c @@ -32,8 +32,10 @@ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ struct ip_set_net { struct ip_set * __rcu *ip_set_list; /* all individual sets */ ip_set_id_t ip_set_max; /* max number of sets */ - int is_deleted; /* deleted by ip_set_net_exit */ + bool is_deleted; /* deleted by ip_set_net_exit */ + bool is_destroyed; /* all sets are destroyed */ }; + static int ip_set_net_id __read_mostly; static inline struct ip_set_net *ip_set_pernet(struct net *net) @@ -42,7 +44,7 @@ static inline struct ip_set_net *ip_set_pernet(struct net *net) } #define IP_SET_INC 64 -#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) +#define STRNCMP(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) static unsigned int max_sets; @@ -59,8 +61,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); #define ip_set(inst, id) \ ip_set_dereference((inst)->ip_set_list)[id] -/* - * The set types are implemented in modules and registered set types +/* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is * serialized by ip_set_type_mutex. */ @@ -85,7 +86,7 @@ find_set_type(const char *name, u8 family, u8 revision) struct ip_set_type *type; list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name) && + if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC) && revision >= type->revision_min && @@ -130,9 +131,10 @@ __find_set_type_get(const char *name, u8 family, u8 revision, goto unlock; } /* Make sure the type is already loaded - * but we don't support the revision */ + * but we don't support the revision + */ list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name)) { + if (STRNCMP(type->name, name)) { err = -IPSET_ERR_FIND_TYPE; goto unlock; } @@ -166,7 +168,7 @@ __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, *min = 255; *max = 0; rcu_read_lock(); list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name) && + if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC)) { found = true; @@ -208,15 +210,15 @@ ip_set_type_register(struct ip_set_type *type) pr_warn("ip_set type %s, family %s with revision min %u already registered!\n", type->name, family_name(type->family), type->revision_min); - ret = -EINVAL; - goto unlock; + ip_set_type_unlock(); + return -EINVAL; } list_add_rcu(&type->list, &ip_set_type_list); pr_debug("type %s, family %s, revision %u:%u registered.\n", type->name, family_name(type->family), type->revision_min, type->revision_max); -unlock: ip_set_type_unlock(); + return ret; } EXPORT_SYMBOL_GPL(ip_set_type_register); @@ -230,12 +232,12 @@ ip_set_type_unregister(struct ip_set_type *type) pr_warn("ip_set type %s, family %s with revision min %u not registered\n", type->name, family_name(type->family), type->revision_min); - goto unlock; + ip_set_type_unlock(); + return; } list_del_rcu(&type->list); pr_debug("type %s, family %s with revision min %u unregistered.\n", type->name, family_name(type->family), type->revision_min); -unlock: ip_set_type_unlock(); synchronize_rcu(); @@ -289,7 +291,7 @@ static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { int ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) { - struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; @@ -306,7 +308,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) { - struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; @@ -317,7 +319,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) return -IPSET_ERR_PROTOCOL; memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), - sizeof(struct in6_addr)); + sizeof(struct in6_addr)); return 0; } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); @@ -362,25 +364,27 @@ add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) } size_t -ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) +ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, + size_t align) { enum ip_set_ext_id id; - size_t offset = 0; u32 cadt_flags = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) set->flags |= IPSET_CREATE_FLAG_FORCEADD; + if (!align) + align = 1; for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; - offset += ALIGN(len + offset, ip_set_extensions[id].align); - set->offset[id] = offset; + len = ALIGN(len, ip_set_extensions[id].align); + set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; - offset += ip_set_extensions[id].len; + len += ip_set_extensions[id].len; } - return len + offset; + return ALIGN(len, align); } EXPORT_SYMBOL_GPL(ip_set_elem_len); @@ -389,13 +393,22 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { u64 fullmark; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + return -IPSET_ERR_PROTOCOL; + if (tb[IPSET_ATTR_TIMEOUT]) { - if (!(set->extensions & IPSET_EXT_TIMEOUT)) + if (!SET_WITH_TIMEOUT(set)) return -IPSET_ERR_TIMEOUT; ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { - if (!(set->extensions & IPSET_EXT_COUNTER)) + if (!SET_WITH_COUNTER(set)) return -IPSET_ERR_COUNTER; if (tb[IPSET_ATTR_BYTES]) ext->bytes = be64_to_cpu(nla_get_be64( @@ -405,25 +418,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], tb[IPSET_ATTR_PACKETS])); } if (tb[IPSET_ATTR_COMMENT]) { - if (!(set->extensions & IPSET_EXT_COMMENT)) + if (!SET_WITH_COMMENT(set)) return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } if (tb[IPSET_ATTR_SKBMARK]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); ext->skbmark = fullmark >> 32; ext->skbmarkmask = fullmark & 0xffffffff; } if (tb[IPSET_ATTR_SKBPRIO]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbprio = be32_to_cpu(nla_get_be32( tb[IPSET_ATTR_SKBPRIO])); } if (tb[IPSET_ATTR_SKBQUEUE]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbqueue = be16_to_cpu(nla_get_be16( tb[IPSET_ATTR_SKBQUEUE])); @@ -432,8 +445,32 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_set_get_extensions); -/* - * Creating/destroying/renaming/swapping affect the existence and +int +ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, + const void *e, bool active) +{ + if (SET_WITH_TIMEOUT(set)) { + unsigned long *timeout = ext_timeout(e, set); + + if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(active ? ip_set_timeout_get(timeout) + : *timeout))) + return -EMSGSIZE; + } + if (SET_WITH_COUNTER(set) && + ip_set_put_counter(skb, ext_counter(e, set))) + return -EMSGSIZE; + if (SET_WITH_COMMENT(set) && + ip_set_put_comment(skb, ext_comment(e, set))) + return -EMSGSIZE; + if (SET_WITH_SKBINFO(set) && + ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) + return -EMSGSIZE; + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_put_extensions); + +/* Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace * only and serialized by the nfnl mutex indirectly from nfnetlink. * @@ -460,8 +497,7 @@ __ip_set_put(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } -/* - * Add, del and test set entries from kernel. +/* Add, del and test set entries from kernel. * * The set behind the index must exist and must be referenced * so it can't be destroyed (or changed) under our foot. @@ -485,27 +521,26 @@ int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_rcu_get( - dev_net(par->in ? par->in : par->out), index); + struct ip_set *set = ip_set_rcu_get(par->net, index); int ret = 0; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set->variant->kadt(set, skb, par, IPSET_ADD, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); ret = 1; } else { /* --return-nomatch: invert matched element */ @@ -524,20 +559,19 @@ int ip_set_add(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_rcu_get( - dev_net(par->in ? par->in : par->out), index); + struct ip_set *set = ip_set_rcu_get(par->net, index); int ret; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); return ret; } @@ -547,27 +581,25 @@ int ip_set_del(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { - struct ip_set *set = ip_set_rcu_get( - dev_net(par->in ? par->in : par->out), index); + struct ip_set *set = ip_set_rcu_get(par->net, index); int ret = 0; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); return ret; } EXPORT_SYMBOL_GPL(ip_set_del); -/* - * Find set by name, reference it once. The reference makes sure the +/* Find set by name, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * */ @@ -581,7 +613,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) rcu_read_lock(); for (i = 0; i < inst->ip_set_max; i++) { s = rcu_dereference(inst->ip_set_list)[i]; - if (s != NULL && STREQ(s->name, name)) { + if (s && STRNCMP(s->name, name)) { __ip_set_get(s); index = i; *set = s; @@ -594,8 +626,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) } EXPORT_SYMBOL_GPL(ip_set_get_byname); -/* - * If the given set pointer points to a valid set, decrement +/* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * @@ -608,7 +639,7 @@ __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) rcu_read_lock(); set = rcu_dereference(inst->ip_set_list)[index]; - if (set != NULL) + if (set) __ip_set_put(set); rcu_read_unlock(); } @@ -622,8 +653,7 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_put_byindex); -/* - * Get the name of a set behind a set index. +/* Get the name of a set behind a set index. * We assume the set is referenced, so it does exist and * can't be destroyed. The set cannot be renamed due to * the referencing either. @@ -634,7 +664,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index) { const struct ip_set *set = ip_set_rcu_get(net, index); - BUG_ON(set == NULL); + BUG_ON(!set); BUG_ON(set->ref == 0); /* Referenced, so it's safe */ @@ -642,13 +672,11 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_name_byindex); -/* - * Routines to call by external subsystems, which do not +/* Routines to call by external subsystems, which do not * call nfnl_lock for us. */ -/* - * Find set by index, reference it once. The reference makes sure the +/* Find set by index, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * * The nfnl mutex is used in the function. @@ -674,8 +702,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); -/* - * If the given set pointer points to a valid set, decrement +/* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * @@ -690,15 +717,14 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index) nfnl_lock(NFNL_SUBSYS_IPSET); if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ set = ip_set(inst, index); - if (set != NULL) + if (set) __ip_set_put(set); } nfnl_unlock(NFNL_SUBSYS_IPSET); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); -/* - * Communication protocol with userspace over netlink. +/* Communication protocol with userspace over netlink. * * The commands are serialized by the nfnl mutex. */ @@ -725,7 +751,7 @@ start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), sizeof(*nfmsg), flags); - if (nlh == NULL) + if (!nlh) return NULL; nfmsg = nlmsg_data(nlh); @@ -758,7 +784,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) *id = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); - if (set != NULL && STREQ(set->name, name)) { + if (set && STRNCMP(set->name, name)) { *id = i; break; } @@ -784,10 +810,10 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, *index = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s == NULL) { + if (!s) { if (*index == IPSET_INVALID_ID) *index = i; - } else if (STREQ(name, s->name)) { + } else if (STRNCMP(name, s->name)) { /* Name clash */ *set = s; return -EEXIST; @@ -816,18 +842,18 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; - struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {}; const char *name, *typename; u8 family, revision; u32 flags = flag_exist(nlh); int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_TYPENAME] == NULL || - attr[IPSET_ATTR_REVISION] == NULL || - attr[IPSET_ATTR_FAMILY] == NULL || - (attr[IPSET_ATTR_DATA] != NULL && + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_TYPENAME] || + !attr[IPSET_ATTR_REVISION] || + !attr[IPSET_ATTR_FAMILY] || + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])))) return -IPSET_ERR_PROTOCOL; @@ -838,33 +864,29 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", name, typename, family_name(family), revision); - /* - * First, and without any locks, allocate and initialize + /* First, and without any locks, allocate and initialize * a normal base set structure. */ - set = kzalloc(sizeof(struct ip_set), GFP_KERNEL); + set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; - rwlock_init(&set->lock); + spin_lock_init(&set->lock); strlcpy(set->name, name, IPSET_MAXNAMELEN); set->family = family; set->revision = revision; - /* - * Next, check that we know the type, and take + /* Next, check that we know the type, and take * a reference on the type, to make sure it stays available * while constructing our new set. * * After referencing the type, we try to create the type * specific part of the set without holding any locks. */ - ret = find_set_type_get(typename, family, revision, &(set->type)); + ret = find_set_type_get(typename, family, revision, &set->type); if (ret) goto out; - /* - * Without holding any locks, create private part. - */ + /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy)) { @@ -878,8 +900,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, /* BTW, ret==0 here. */ - /* - * Here, we have a valid, constructed set and we are protected + /* Here, we have a valid, constructed set and we are protected * by the nfnl mutex. Find the first free index in ip_set_list * and check clashing. */ @@ -887,7 +908,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, if (ret == -EEXIST) { /* If this is the same set and requested, ignore error */ if ((flags & IPSET_FLAG_EXIST) && - STREQ(set->type->name, clash->type->name) && + STRNCMP(set->type->name, clash->type->name) && set->type->family == clash->type->family && set->type->revision_min == clash->type->revision_min && set->type->revision_max == clash->type->revision_max && @@ -902,7 +923,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, /* Wraparound */ goto cleanup; - list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL); + list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ @@ -916,12 +937,11 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, inst->ip_set_max = i; kfree(tmp); ret = 0; - } else if (ret) + } else if (ret) { goto cleanup; + } - /* - * Finally! Add our shiny new set to the list, and be done. - */ + /* Finally! Add our shiny new set to the list, and be done. */ pr_debug("create: '%s' created with index %u!\n", set->name, index); ip_set(inst, index) = set; @@ -946,12 +966,9 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { }; static void -ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index) +ip_set_destroy_set(struct ip_set *set) { - struct ip_set *set = ip_set(inst, index); - pr_debug("set: %s\n", set->name); - ip_set(inst, index) = NULL; /* Must call it without holding any lock */ set->variant->destroy(set); @@ -986,30 +1003,36 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL && s->ref) { + if (s && s->ref) { ret = -IPSET_ERR_BUSY; goto out; } } + inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL) - ip_set_destroy_set(inst, i); + if (s) { + ip_set(inst, i) = NULL; + ip_set_destroy_set(s); + } } + /* Modified by ip_set_destroy() only, which is serialized */ + inst->is_destroyed = false; } else { s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); - if (s == NULL) { + if (!s) { ret = -ENOENT; goto out; } else if (s->ref) { ret = -IPSET_ERR_BUSY; goto out; } + ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); - ip_set_destroy_set(inst, i); + ip_set_destroy_set(s); } return 0; out: @@ -1024,9 +1047,9 @@ ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set->variant->flush(set); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); } static int @@ -1044,12 +1067,12 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL) + if (s) ip_set_flush_set(s); } } else { s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (s == NULL) + if (!s) return -ENOENT; ip_set_flush_set(s); @@ -1081,12 +1104,12 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_SETNAME2] == NULL)) + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; read_lock_bh(&ip_set_ref_lock); @@ -1098,7 +1121,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL && STREQ(s->name, name2)) { + if (s && STRNCMP(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; } @@ -1130,23 +1153,24 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, char from_name[IPSET_MAXNAMELEN]; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_SETNAME2] == NULL)) + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &from_id); - if (from == NULL) + if (!from) return -ENOENT; to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id); - if (to == NULL) + if (!to) return -IPSET_ERR_EXIST_SETNAME2; /* Features must not change. - * Not an artificial restriction anymore, as we must prevent - * possible loops created by swapping in setlist type of sets. */ + * Not an artifical restriction anymore, as we must prevent + * possible loops created by swapping in setlist type of sets. + */ if (!(from->type->features == to->type->features && from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; @@ -1177,12 +1201,16 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, static int ip_set_dump_done(struct netlink_callback *cb) { - struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; if (cb->args[IPSET_CB_ARG0]) { - pr_debug("release set %s\n", - ip_set(inst, cb->args[IPSET_CB_INDEX])->name); - __ip_set_put_byindex(inst, - (ip_set_id_t) cb->args[IPSET_CB_INDEX]); + struct ip_set_net *inst = + (struct ip_set_net *)cb->args[IPSET_CB_NET]; + ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; + struct ip_set *set = ip_set(inst, index); + + if (set->variant->uref) + set->variant->uref(set, cb, false); + pr_debug("release set %s\n", set->name); + __ip_set_put_byindex(inst, index); } return 0; } @@ -1204,7 +1232,7 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; ip_set_id_t index; @@ -1213,27 +1241,23 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); - /* cb->args[IPSET_CB_NET]: net namespace - * [IPSET_CB_DUMP]: dump single set/all sets - * [IPSET_CB_INDEX]: set index - * [IPSET_CB_ARG0]: type specific - */ - if (cda[IPSET_ATTR_SETNAME]) { struct ip_set *set; set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), &index); - if (set == NULL) + if (!set) return -ENOENT; dump_type = DUMP_ONE; cb->args[IPSET_CB_INDEX] = index; - } else + } else { dump_type = DUMP_ALL; + } if (cda[IPSET_ATTR_FLAGS]) { u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); + dump_type |= (f << 16); } cb->args[IPSET_CB_NET] = (unsigned long)inst; @@ -1251,6 +1275,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0; struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type, dump_flags; + bool is_destroyed; int ret = 0; if (!cb->args[IPSET_CB_DUMP]) { @@ -1258,7 +1283,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) if (ret < 0) { nlh = nlmsg_hdr(cb->skb); /* We have to create and send the error message - * manually :-( */ + * manually :-( + */ if (nlh->nlmsg_flags & NLM_F_ACK) netlink_ack(cb->skb, nlh, ret); return ret; @@ -1276,13 +1302,21 @@ dump_last: pr_debug("dump type, flag: %u %u index: %ld\n", dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { - index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; + index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; + write_lock_bh(&ip_set_ref_lock); set = ip_set(inst, index); - if (set == NULL) { + is_destroyed = inst->is_destroyed; + if (!set || is_destroyed) { + write_unlock_bh(&ip_set_ref_lock); if (dump_type == DUMP_ONE) { ret = -ENOENT; goto out; } + if (is_destroyed) { + /* All sets are just being destroyed */ + ret = 0; + goto out; + } continue; } /* When dumping all sets, we must dump "sorted" @@ -1290,14 +1324,17 @@ dump_last: */ if (dump_type != DUMP_ONE && ((dump_type == DUMP_ALL) == - !!(set->type->features & IPSET_DUMP_LAST))) + !!(set->type->features & IPSET_DUMP_LAST))) { + write_unlock_bh(&ip_set_ref_lock); continue; + } pr_debug("List set: %s\n", set->name); if (!cb->args[IPSET_CB_ARG0]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); - __ip_set_get(set); + set->ref++; } + write_unlock_bh(&ip_set_ref_lock); nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, IPSET_CMD_LIST); @@ -1325,11 +1362,13 @@ dump_last: goto release_refcount; if (dump_flags & IPSET_FLAG_LIST_HEADER) goto next_set; + if (set->variant->uref) + set->variant->uref(set, cb, true); /* Fall through and add elements */ default: - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->list(set, skb, cb); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); if (!cb->args[IPSET_CB_ARG0]) /* Set is done, proceed with next one */ goto next_set; @@ -1341,6 +1380,8 @@ dump_last: dump_type = DUMP_LAST; cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); cb->args[IPSET_CB_INDEX] = 0; + if (set && set->variant->uref) + set->variant->uref(set, cb, false); goto dump_last; } goto out; @@ -1355,7 +1396,10 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { - pr_debug("release set %s\n", ip_set(inst, index)->name); + set = ip_set(inst, index); + if (set->variant->uref) + set->variant->uref(set, cb, false); + pr_debug("release set %s\n", set->name); __ip_set_put_byindex(inst, index); cb->args[IPSET_CB_ARG0] = 0; } @@ -1407,9 +1451,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); retried = true; } while (ret == -EAGAIN && set->variant->resize && @@ -1425,12 +1469,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, size_t payload = min(SIZE_MAX, sizeof(*errmsg) + nlmsg_len(nlh)); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *cmdattr; u32 *errline; skb2 = nlmsg_new(payload, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); @@ -1447,7 +1491,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, *errline = lineno; - netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } @@ -1462,25 +1507,25 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || + !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] != NULL && + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] != NULL && + (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || - attr[IPSET_ATTR_LINENO] == NULL)))) + !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; @@ -1517,25 +1562,25 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || + !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] != NULL && + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] != NULL && + (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || - attr[IPSET_ATTR_LINENO] == NULL)))) + !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; @@ -1572,26 +1617,26 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_DATA] == NULL || + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_DATA] || !flag_nested(attr[IPSET_ATTR_DATA]))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy)) return -IPSET_ERR_PROTOCOL; - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) ret = 1; @@ -1613,15 +1658,15 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL)) + !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1670,8 +1715,8 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_TYPENAME] == NULL || - attr[IPSET_ATTR_FAMILY] == NULL)) + !attr[IPSET_ATTR_TYPENAME] || + !attr[IPSET_ATTR_FAMILY])) return -IPSET_ERR_PROTOCOL; family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); @@ -1681,7 +1726,7 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb, return ret; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1726,11 +1771,11 @@ ip_set_protocol(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh2; int ret = 0; - if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL)) + if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1858,7 +1903,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) ret = -EFAULT; goto done; } - op = (unsigned int *) data; + op = (unsigned int *)data; if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ @@ -1970,10 +2015,11 @@ ip_set_net_init(struct net *net) if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; - list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL); + list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; - inst->is_deleted = 0; + inst->is_deleted = false; + inst->is_destroyed = false; rcu_assign_pointer(inst->ip_set_list, list); return 0; } @@ -1986,12 +2032,14 @@ ip_set_net_exit(struct net *net) struct ip_set *set = NULL; ip_set_id_t i; - inst->is_deleted = 1; /* flag for ip_set_nfnl_put */ + inst->is_deleted = true; /* flag for ip_set_nfnl_put */ for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); - if (set != NULL) - ip_set_destroy_set(inst, i); + if (set) { + ip_set(inst, i) = NULL; + ip_set_destroy_set(set); + } } kfree(rcu_dereference_protected(inst->ip_set_list, 1)); } @@ -2003,11 +2051,11 @@ static struct pernet_operations ip_set_net_ops = { .size = sizeof(struct ip_set_net) }; - static int __init ip_set_init(void) { int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + if (ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); return ret; diff --git a/kernel/net/netfilter/ipset/ip_set_getport.c b/kernel/net/netfilter/ipset/ip_set_getport.c index 29fb01ddf..42c3e3ba1 100644 --- a/kernel/net/netfilter/ipset/ip_set_getport.c +++ b/kernel/net/netfilter/ipset/ip_set_getport.c @@ -30,7 +30,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct tcphdr *th; th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph); - if (th == NULL) + if (!th) /* No choice either */ return false; @@ -42,7 +42,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const sctp_sctphdr_t *sh; sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh); - if (sh == NULL) + if (!sh) /* No choice either */ return false; @@ -55,7 +55,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct udphdr *uh; uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph); - if (uh == NULL) + if (!uh) /* No choice either */ return false; @@ -67,7 +67,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct icmphdr *ic; ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); - if (ic == NULL) + if (!ic) return false; *port = (__force __be16)htons((ic->type << 8) | ic->code); @@ -78,7 +78,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct icmp6hdr *ic; ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); - if (ic == NULL) + if (!ic) return false; *port = (__force __be16) @@ -98,7 +98,7 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src, __be16 *port, u8 *proto) { const struct iphdr *iph = ip_hdr(skb); - unsigned int protooff = ip_hdrlen(skb); + unsigned int protooff = skb_network_offset(skb) + ip_hdrlen(skb); int protocol = iph->protocol; /* See comments at tcp_match in ip_tables.c */ @@ -116,7 +116,8 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src, return false; default: /* Other protocols doesn't have ports, - so we can match fragments */ + * so we can match fragments. + */ *proto = protocol; return true; } @@ -135,7 +136,9 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, __be16 frag_off = 0; nexthdr = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + protoff = ipv6_skip_exthdr(skb, + skb_network_offset(skb) + + sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) return false; diff --git a/kernel/net/netfilter/ipset/ip_set_hash_gen.h b/kernel/net/netfilter/ipset/ip_set_hash_gen.h index 974ff386d..e5336ab36 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_gen.h +++ b/kernel/net/netfilter/ipset/ip_set_hash_gen.h @@ -10,19 +10,19 @@ #include #include +#include #include -#ifndef rcu_dereference_bh -#define rcu_dereference_bh(p) rcu_dereference(p) -#endif + +#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) +#define ipset_dereference_protected(p, set) \ + __ipset_dereference_protected(p, spin_is_locked(&(set)->lock)) #define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) /* Hashing which uses arrays to resolve clashing. The hash table is resized * (doubled) when searching becomes too long. * Internally jhash is used with the assumption that the size of the - * stored data is a multiple of sizeof(u32). If storage supports timeout, - * the timeout field must be the last one in the data structure - that field - * is ignored when computing the hash key. + * stored data is a multiple of sizeof(u32). * * Readers and resizing * @@ -35,7 +35,9 @@ /* Number of elements to store in an initial array block */ #define AHASH_INIT_SIZE 4 /* Max number of elements to store in an array block */ -#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE) +#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE) +/* Max muber of elements in the array block when tuned */ +#define AHASH_MAX_TUNED 64 /* Max number of elements can be tuned */ #ifdef IP_SET_HASH_WITH_MULTI @@ -53,8 +55,9 @@ tune_ahash_max(u8 curr, u32 multi) /* Currently, at listing one hash bucket must fit into a message. * Therefore we have a hard limit here. */ - return n > curr && n <= 64 ? n : curr; + return n > curr && n <= AHASH_MAX_TUNED ? n : curr; } + #define TUNE_AHASH_MAX(h, multi) \ ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) #else @@ -64,18 +67,24 @@ tune_ahash_max(u8 curr, u32 multi) /* A hash bucket */ struct hbucket { - void *value; /* the array of the values */ + struct rcu_head rcu; /* for call_rcu_bh */ + /* Which positions are used in the array */ + DECLARE_BITMAP(used, AHASH_MAX_TUNED); u8 size; /* size of the array */ u8 pos; /* position of the first free entry */ + unsigned char value[0] /* the array of the values */ + __aligned(__alignof__(u64)); }; /* The hash table: the table size stored here in order to make resizing easy */ struct htable { + atomic_t ref; /* References for resizing */ + atomic_t uref; /* References for dumping */ u8 htable_bits; /* size of hash table == 2^htable_bits */ - struct hbucket bucket[0]; /* hashtable buckets */ + struct hbucket __rcu *bucket[0]; /* hashtable buckets */ }; -#define hbucket(h, i) (&((h)->bucket[i])) +#define hbucket(h, i) ((h)->bucket[i]) #ifndef IPSET_NET_COUNT #define IPSET_NET_COUNT 1 @@ -83,8 +92,8 @@ struct htable { /* Book-keeping of the prefixes added to the set */ struct net_prefixes { - u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */ - u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */ + u32 nets[IPSET_NET_COUNT]; /* number of elements for this cidr */ + u8 cidr[IPSET_NET_COUNT]; /* the cidr value */ }; /* Compute the hash table size */ @@ -97,11 +106,11 @@ htable_size(u8 hbits) if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket) + if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; - return hsize * sizeof(struct hbucket) + sizeof(struct htable); + return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } /* Compute htable_bits from the user input parameter hashsize */ @@ -110,6 +119,7 @@ htable_bits(u32 hashsize) { /* Assume that hashsize == 2^htable_bits */ u8 bits = fls(hashsize - 1); + if (jhash_size(bits) != hashsize) /* Round up to the first 2^n value */ bits = fls(hashsize); @@ -117,30 +127,6 @@ htable_bits(u32 hashsize) return bits; } -static int -hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) -{ - if (n->pos >= n->size) { - void *tmp; - - if (n->size >= ahash_max) - /* Trigger rehashing */ - return -EAGAIN; - - tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, - GFP_ATOMIC); - if (!tmp) - return -ENOMEM; - if (n->size) { - memcpy(tmp, n->value, n->size * dsize); - kfree(n->value); - } - n->value = tmp; - n->size += AHASH_INIT_SIZE; - } - return 0; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -149,23 +135,31 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #endif /* cidr + 1 is stored in net_prefixes to support /0 */ -#define SCIDR(cidr, i) (__CIDR(cidr, i) + 1) +#define NCIDR_PUT(cidr) ((cidr) + 1) +#define NCIDR_GET(cidr) ((cidr) - 1) #ifdef IP_SET_HASH_WITH_NETS_PACKED /* When cidr is packed with nomatch, cidr - 1 is stored in the data entry */ -#define GCIDR(cidr, i) (__CIDR(cidr, i) + 1) -#define NCIDR(cidr) (cidr) +#define DCIDR_PUT(cidr) ((cidr) - 1) +#define DCIDR_GET(cidr, i) (__CIDR(cidr, i) + 1) #else -#define GCIDR(cidr, i) (__CIDR(cidr, i)) -#define NCIDR(cidr) (cidr - 1) +#define DCIDR_PUT(cidr) (cidr) +#define DCIDR_GET(cidr, i) __CIDR(cidr, i) #endif +#define INIT_CIDR(cidr, host_mask) \ + DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask)) + #define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128) #ifdef IP_SET_HASH_WITH_NET0 +/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) (SET_HOST_MASK(family) + 1) +#define CIDR_POS(c) ((c) - 1) #else +/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) SET_HOST_MASK(family) +#define CIDR_POS(c) ((c) - 2) #endif #else @@ -180,6 +174,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_data_equal #undef mtype_do_data_match #undef mtype_data_set_flags +#undef mtype_data_reset_elem #undef mtype_data_reset_flags #undef mtype_data_netmask #undef mtype_data_list @@ -193,7 +188,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_ahash_memsize #undef mtype_flush #undef mtype_destroy -#undef mtype_gc_init #undef mtype_same_set #undef mtype_kadt #undef mtype_uadt @@ -203,6 +197,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_del #undef mtype_test_cidrs #undef mtype_test +#undef mtype_uref #undef mtype_expire #undef mtype_resize #undef mtype_head @@ -227,6 +222,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_data_list IPSET_TOKEN(MTYPE, _data_list) #define mtype_data_next IPSET_TOKEN(MTYPE, _data_next) #define mtype_elem IPSET_TOKEN(MTYPE, _elem) + #define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy) #define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) #define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr) @@ -234,7 +230,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize) #define mtype_flush IPSET_TOKEN(MTYPE, _flush) #define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) -#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) #define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) #define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) @@ -244,23 +239,36 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) #define mtype_test IPSET_TOKEN(MTYPE, _test) +#define mtype_uref IPSET_TOKEN(MTYPE, _uref) #define mtype_expire IPSET_TOKEN(MTYPE, _expire) #define mtype_resize IPSET_TOKEN(MTYPE, _resize) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) +#ifndef MTYPE +#error "MTYPE is not defined!" +#endif + +#ifndef HOST_MASK +#error "HOST_MASK is not defined!" +#endif + #ifndef HKEY_DATALEN #define HKEY_DATALEN sizeof(struct mtype_elem) #endif #define HKEY(data, initval, htable_bits) \ -(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \ +(jhash2((u32 *)(data), HKEY_DATALEN / sizeof(u32), initval) \ & jhash_mask(htable_bits)) #ifndef htype +#ifndef HTYPE +#error "HTYPE is not defined!" +#endif /* HTYPE */ #define htype HTYPE /* The generic hash structure */ @@ -280,18 +288,16 @@ struct htype { #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif -#ifdef IP_SET_HASH_WITH_RBTREE - struct rb_root rbtree; -#endif #ifdef IP_SET_HASH_WITH_NETS struct net_prefixes nets[0]; /* book-keeping of prefixes */ #endif }; -#endif +#endif /* htype */ #ifdef IP_SET_HASH_WITH_NETS /* Network cidr size book keeping when the hash stores different - * sized networks */ + * sized networks. cidr == real cidr + 1 to support /0. + */ static void mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) { @@ -299,12 +305,12 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) /* Add in increasing prefix order, so larger cidr first */ for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) { - if (j != -1) + if (j != -1) { continue; - else if (h->nets[i].cidr[n] < cidr) + } else if (h->nets[i].cidr[n] < cidr) { j = i; - else if (h->nets[i].cidr[n] == cidr) { - h->nets[cidr - 1].nets[n]++; + } else if (h->nets[i].cidr[n] == cidr) { + h->nets[CIDR_POS(cidr)].nets[n]++; return; } } @@ -313,7 +319,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; } h->nets[i].cidr[n] = cidr; - h->nets[cidr - 1].nets[n] = 1; + h->nets[CIDR_POS(cidr)].nets[n] = 1; } static void @@ -322,15 +328,15 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) u8 i, j, net_end = nets_length - 1; for (i = 0; i < nets_length; i++) { - if (h->nets[i].cidr[n] != cidr) - continue; - h->nets[cidr -1].nets[n]--; - if (h->nets[cidr -1].nets[n] > 0) - return; + if (h->nets[i].cidr[n] != cidr) + continue; + h->nets[CIDR_POS(cidr)].nets[n]--; + if (h->nets[CIDR_POS(cidr)].nets[n] > 0) + return; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) - h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; + h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; h->nets[j].cidr[n] = 0; - return; + return; } } #endif @@ -341,15 +347,18 @@ mtype_ahash_memsize(const struct htype *h, const struct htable *t, u8 nets_length, size_t dsize) { u32 i; - size_t memsize = sizeof(*h) - + sizeof(*t) + struct hbucket *n; + size_t memsize = sizeof(*h) + sizeof(*t); + #ifdef IP_SET_HASH_WITH_NETS - + sizeof(struct net_prefixes) * nets_length + memsize += sizeof(struct net_prefixes) * nets_length; #endif - + jhash_size(t->htable_bits) * sizeof(struct hbucket); - - for (i = 0; i < jhash_size(t->htable_bits); i++) - memsize += t->bucket[i].size * dsize; + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = rcu_dereference_bh(hbucket(t, i)); + if (!n) + continue; + memsize += sizeof(struct hbucket) + n->size * dsize; + } return memsize; } @@ -364,7 +373,8 @@ mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) int i; for (i = 0; i < n->pos; i++) - ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); + if (test_bit(i, n->used)) + ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); } /* Flush a hash type of set: destroy all elements */ @@ -376,16 +386,16 @@ mtype_flush(struct ip_set *set) struct hbucket *n; u32 i; - t = rcu_dereference_bh_nfnl(h->table); + t = ipset_dereference_protected(h->table, set); for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - if (n->size) { - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set, n); - n->size = n->pos = 0; - /* FIXME: use slab cache */ - kfree(n->value); - } + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); } #ifdef IP_SET_HASH_WITH_NETS memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family)); @@ -401,13 +411,13 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - if (n->size) { - if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) - mtype_ext_cleanup(set, n); - /* FIXME: use slab cache */ - kfree(n->value); - } + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + kfree(n); } ip_set_free(t); @@ -419,13 +429,11 @@ mtype_destroy(struct ip_set *set) { struct htype *h = set->data; - if (set->extensions & IPSET_EXT_TIMEOUT) + if (SET_WITH_TIMEOUT(set)) del_timer_sync(&h->gc); - mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true); -#ifdef IP_SET_HASH_WITH_RBTREE - rbtree_destroy(&h->rbtree); -#endif + mtype_ahash_destroy(set, + __ipset_dereference_protected(h->table, 1), true); kfree(h); set->data = NULL; @@ -437,7 +445,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) struct htype *h = set->data; init_timer(&h->gc); - h->gc.data = (unsigned long) set; + h->gc.data = (unsigned long)set; h->gc.function = gc; h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&h->gc); @@ -468,63 +476,78 @@ static void mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) { struct htable *t; - struct hbucket *n; + struct hbucket *n, *tmp; struct mtype_elem *data; - u32 i; - int j; + u32 i, j, d; #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + t = ipset_dereference_protected(h->table, set); for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - for (j = 0; j < n->pos; j++) { + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + for (j = 0, d = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) { + d++; + continue; + } data = ahash_data(n, j, dsize); if (ip_set_timeout_expired(ext_timeout(data, set))) { pr_debug("expired %u/%u\n", i, j); + clear_bit(j, n->used); + smp_mb__after_atomic(); #ifdef IP_SET_HASH_WITH_NETS for (k = 0; k < IPSET_NET_COUNT; k++) - mtype_del_cidr(h, SCIDR(data->cidr, k), - nets_length, k); + mtype_del_cidr(h, + NCIDR_PUT(DCIDR_GET(data->cidr, + k)), + nets_length, k); #endif ip_set_ext_destroy(set, data); - if (j != n->pos - 1) - /* Not last one */ - memcpy(data, - ahash_data(n, n->pos - 1, dsize), - dsize); - n->pos--; h->elements--; + d++; } } - if (n->pos + AHASH_INIT_SIZE < n->size) { - void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) - * dsize, - GFP_ATOMIC); + if (d >= AHASH_INIT_SIZE) { + if (d >= n->size) { + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); + continue; + } + tmp = kzalloc(sizeof(*tmp) + + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); if (!tmp) /* Still try to delete expired elements */ continue; - n->size -= AHASH_INIT_SIZE; - memcpy(tmp, n->value, n->size * dsize); - kfree(n->value); - n->value = tmp; + tmp->size = n->size - AHASH_INIT_SIZE; + for (j = 0, d = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + memcpy(tmp->value + d * dsize, data, dsize); + set_bit(d, tmp->used); + d++; + } + tmp->pos = d; + rcu_assign_pointer(hbucket(t, i), tmp); + kfree_rcu(n, rcu); } } - rcu_read_unlock_bh(); } static void mtype_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct htype *h = set->data; pr_debug("called\n"); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); mtype_expire(set, h, NLEN(set->family), set->dsize); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&h->gc); @@ -532,93 +555,152 @@ mtype_gc(unsigned long ul_set) /* Resize a hash: create a new hash table with doubling the hashsize * and inserting the elements to it. Repeat until we succeed or - * fail due to memory pressures. */ + * fail due to memory pressures. + */ static int mtype_resize(struct ip_set *set, bool retried) { struct htype *h = set->data; - struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table); - u8 htable_bits = orig->htable_bits; + struct htable *t, *orig; + u8 htable_bits; + size_t dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; + struct mtype_elem *tmp; #endif struct mtype_elem *data; struct mtype_elem *d; struct hbucket *n, *m; - u32 i, j; + u32 i, j, key; int ret; - /* Try to cleanup once */ - if (SET_WITH_TIMEOUT(set) && !retried) { - i = h->elements; - write_lock_bh(&set->lock); - mtype_expire(set, set->data, NLEN(set->family), set->dsize); - write_unlock_bh(&set->lock); - if (h->elements < i) - return 0; - } +#ifdef IP_SET_HASH_WITH_NETS + tmp = kmalloc(dsize, GFP_KERNEL); + if (!tmp) + return -ENOMEM; +#endif + rcu_read_lock_bh(); + orig = rcu_dereference_bh_nfnl(h->table); + htable_bits = orig->htable_bits; + rcu_read_unlock_bh(); retry: ret = 0; htable_bits++; - pr_debug("attempt to resize set %s from %u to %u, t %p\n", - set->name, orig->htable_bits, htable_bits, orig); if (!htable_bits) { /* In case we have plenty of memory :-) */ pr_warn("Cannot increase the hashsize of set %s further\n", set->name); - return -IPSET_ERR_HASH_FULL; + ret = -IPSET_ERR_HASH_FULL; + goto out; + } + t = ip_set_alloc(htable_size(htable_bits)); + if (!t) { + ret = -ENOMEM; + goto out; } - t = ip_set_alloc(sizeof(*t) - + jhash_size(htable_bits) * sizeof(struct hbucket)); - if (!t) - return -ENOMEM; t->htable_bits = htable_bits; - read_lock_bh(&set->lock); + spin_lock_bh(&set->lock); + orig = __ipset_dereference_protected(h->table, 1); + /* There can't be another parallel resizing, but dumping is possible */ + atomic_set(&orig->ref, 1); + atomic_inc(&orig->uref); + pr_debug("attempt to resize set %s from %u to %u, t %p\n", + set->name, orig->htable_bits, htable_bits, orig); for (i = 0; i < jhash_size(orig->htable_bits); i++) { - n = hbucket(orig, i); + n = __ipset_dereference_protected(hbucket(orig, i), 1); + if (!n) + continue; for (j = 0; j < n->pos; j++) { - data = ahash_data(n, j, set->dsize); + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); #ifdef IP_SET_HASH_WITH_NETS + /* We have readers running parallel with us, + * so the live data cannot be modified. + */ flags = 0; + memcpy(tmp, data, dsize); + data = tmp; mtype_data_reset_flags(data, &flags); #endif - m = hbucket(t, HKEY(data, h->initval, htable_bits)); - ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize); - if (ret < 0) { -#ifdef IP_SET_HASH_WITH_NETS - mtype_data_reset_flags(data, &flags); -#endif - read_unlock_bh(&set->lock); - mtype_ahash_destroy(set, t, false); - if (ret == -EAGAIN) - goto retry; - return ret; + key = HKEY(data, h->initval, htable_bits); + m = __ipset_dereference_protected(hbucket(t, key), 1); + if (!m) { + m = kzalloc(sizeof(*m) + + AHASH_INIT_SIZE * dsize, + GFP_ATOMIC); + if (!m) { + ret = -ENOMEM; + goto cleanup; + } + m->size = AHASH_INIT_SIZE; + RCU_INIT_POINTER(hbucket(t, key), m); + } else if (m->pos >= m->size) { + struct hbucket *ht; + + if (m->size >= AHASH_MAX(h)) { + ret = -EAGAIN; + } else { + ht = kzalloc(sizeof(*ht) + + (m->size + AHASH_INIT_SIZE) + * dsize, + GFP_ATOMIC); + if (!ht) + ret = -ENOMEM; + } + if (ret < 0) + goto cleanup; + memcpy(ht, m, sizeof(struct hbucket) + + m->size * dsize); + ht->size = m->size + AHASH_INIT_SIZE; + kfree(m); + m = ht; + RCU_INIT_POINTER(hbucket(t, key), ht); } - d = ahash_data(m, m->pos++, set->dsize); - memcpy(d, data, set->dsize); + d = ahash_data(m, m->pos, dsize); + memcpy(d, data, dsize); + set_bit(m->pos++, m->used); #ifdef IP_SET_HASH_WITH_NETS mtype_data_reset_flags(d, &flags); #endif } } - rcu_assign_pointer(h->table, t); - read_unlock_bh(&set->lock); + + spin_unlock_bh(&set->lock); /* Give time to other readers of the set */ synchronize_rcu_bh(); pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, orig->htable_bits, orig, t->htable_bits, t); - mtype_ahash_destroy(set, orig, false); + /* If there's nobody else dumping the table, destroy it */ + if (atomic_dec_and_test(&orig->uref)) { + pr_debug("Table destroy by resize %p\n", orig); + mtype_ahash_destroy(set, orig, false); + } - return 0; +out: +#ifdef IP_SET_HASH_WITH_NETS + kfree(tmp); +#endif + return ret; + +cleanup: + atomic_set(&orig->ref, 0); + atomic_dec(&orig->uref); + spin_unlock_bh(&set->lock); + mtype_ahash_destroy(set, t, false); + if (ret == -EAGAIN) + goto retry; + goto out; } /* Add an element to a hash and update the internal counters when succeeded, - * otherwise report the proper error code. */ + * otherwise report the proper error code. + */ static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) @@ -627,17 +709,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct htable *t; const struct mtype_elem *d = value; struct mtype_elem *data; - struct hbucket *n; - int i, ret = 0; - int j = AHASH_MAX(h) + 1; + struct hbucket *n, *old = ERR_PTR(-ENOENT); + int i, j = -1; bool flag_exist = flags & IPSET_FLAG_EXIST; + bool deleted = false, forceadd = false, reuse = false; u32 key, multi = 0; - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + if (h->elements >= h->maxelem) { + if (SET_WITH_TIMEOUT(set)) + /* FIXME: when set is full, we slow down here */ + mtype_expire(set, h, NLEN(set->family), set->dsize); + if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) + forceadd = true; + } + + t = ipset_dereference_protected(h->table, set); key = HKEY(value, h->initval, t->htable_bits); - n = hbucket(t, key); + n = __ipset_dereference_protected(hbucket(t, key), 1); + if (!n) { + if (forceadd) { + if (net_ratelimit()) + pr_warn("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; + } else if (h->elements >= h->maxelem) { + goto set_full; + } + old = NULL; + n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, + GFP_ATOMIC); + if (!n) + return -ENOMEM; + n->size = AHASH_INIT_SIZE; + goto copy_elem; + } for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) { + /* Reuse first deleted entry */ + if (j == -1) { + deleted = reuse = true; + j = i; + } + continue; + } data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi)) { if (flag_exist || @@ -645,85 +759,94 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_timeout_expired(ext_timeout(data, set)))) { /* Just the extensions could be overwritten */ j = i; - goto reuse_slot; - } else { - ret = -IPSET_ERR_EXIST; - goto out; + goto overwrite_extensions; } + return -IPSET_ERR_EXIST; } /* Reuse first timed out entry */ if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set)) && - j != AHASH_MAX(h) + 1) + j == -1) { j = i; + reuse = true; + } } - if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set) && n->pos) { - /* Choosing the first entry in the array to replace */ - j = 0; - goto reuse_slot; - } - if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) - /* FIXME: when set is full, we slow down here */ - mtype_expire(set, h, NLEN(set->family), set->dsize); - - if (h->elements >= h->maxelem) { - if (net_ratelimit()) - pr_warn("Set %s is full, maxelem %u reached\n", - set->name, h->maxelem); - ret = -IPSET_ERR_HASH_FULL; - goto out; - } - -reuse_slot: - if (j != AHASH_MAX(h) + 1) { - /* Fill out reused slot */ + if (reuse || forceadd) { data = ahash_data(n, j, set->dsize); + if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS - for (i = 0; i < IPSET_NET_COUNT; i++) { - mtype_del_cidr(h, SCIDR(data->cidr, i), - NLEN(set->family), i); - mtype_add_cidr(h, SCIDR(d->cidr, i), - NLEN(set->family), i); - } + for (i = 0; i < IPSET_NET_COUNT; i++) + mtype_del_cidr(h, + NCIDR_PUT(DCIDR_GET(data->cidr, i)), + NLEN(set->family), i); #endif - ip_set_ext_destroy(set, data); - } else { - /* Use/create a new slot */ + ip_set_ext_destroy(set, data); + h->elements--; + } + goto copy_data; + } + if (h->elements >= h->maxelem) + goto set_full; + /* Create a new slot */ + if (n->pos >= n->size) { TUNE_AHASH_MAX(h, multi); - ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize); - if (ret != 0) { - if (ret == -EAGAIN) - mtype_data_next(&h->next, d); - goto out; + if (n->size >= AHASH_MAX(h)) { + /* Trigger rehashing */ + mtype_data_next(&h->next, d); + return -EAGAIN; } - data = ahash_data(n, n->pos++, set->dsize); + old = n; + n = kzalloc(sizeof(*n) + + (old->size + AHASH_INIT_SIZE) * set->dsize, + GFP_ATOMIC); + if (!n) + return -ENOMEM; + memcpy(n, old, sizeof(struct hbucket) + + old->size * set->dsize); + n->size = old->size + AHASH_INIT_SIZE; + } + +copy_elem: + j = n->pos++; + data = ahash_data(n, j, set->dsize); +copy_data: + h->elements++; #ifdef IP_SET_HASH_WITH_NETS - for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_add_cidr(h, SCIDR(d->cidr, i), NLEN(set->family), - i); + for (i = 0; i < IPSET_NET_COUNT; i++) + mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), + NLEN(set->family), i); #endif - h->elements++; - } memcpy(data, d, sizeof(struct mtype_elem)); +overwrite_extensions: #ifdef IP_SET_HASH_WITH_NETS mtype_data_set_flags(data, flags); #endif - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(data, set), ext->timeout); if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(data, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(data, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(data, set), ext); + /* Must come last for the case when timed out entry is reused */ + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(data, set), ext->timeout); + smp_mb__before_atomic(); + set_bit(j, n->used); + if (old != ERR_PTR(-ENOENT)) { + rcu_assign_pointer(hbucket(t, key), n); + if (old) + kfree_rcu(old, rcu); + } -out: - rcu_read_unlock_bh(); - return ret; + return 0; +set_full: + if (net_ratelimit()) + pr_warn("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; } -/* Delete an element from the hash: swap it with the last element - * and free up space if possible. +/* Delete an element from the hash and free up space if possible. */ static int mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -734,55 +857,70 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n; - int i, ret = -IPSET_ERR_EXIST; -#ifdef IP_SET_HASH_WITH_NETS - u8 j; -#endif + int i, j, k, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; + size_t dsize = set->dsize; - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + t = ipset_dereference_protected(h->table, set); key = HKEY(value, h->initval, t->htable_bits); - n = hbucket(t, key); - for (i = 0; i < n->pos; i++) { - data = ahash_data(n, i, set->dsize); + n = __ipset_dereference_protected(hbucket(t, key), 1); + if (!n) + goto out; + for (i = 0, k = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) { + k++; + continue; + } + data = ahash_data(n, i, dsize); if (!mtype_data_equal(data, d, &multi)) continue; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set))) goto out; - if (i != n->pos - 1) - /* Not last one */ - memcpy(data, ahash_data(n, n->pos - 1, set->dsize), - set->dsize); - n->pos--; + ret = 0; + clear_bit(i, n->used); + smp_mb__after_atomic(); + if (i + 1 == n->pos) + n->pos--; h->elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) - mtype_del_cidr(h, SCIDR(d->cidr, j), NLEN(set->family), - j); + mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), + NLEN(set->family), j); #endif ip_set_ext_destroy(set, data); - if (n->pos + AHASH_INIT_SIZE < n->size) { - void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) - * set->dsize, - GFP_ATOMIC); - if (!tmp) { - ret = 0; + + for (; i < n->pos; i++) { + if (!test_bit(i, n->used)) + k++; + } + if (n->pos == 0 && k == 0) { + rcu_assign_pointer(hbucket(t, key), NULL); + kfree_rcu(n, rcu); + } else if (k >= AHASH_INIT_SIZE) { + struct hbucket *tmp = kzalloc(sizeof(*tmp) + + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); + if (!tmp) goto out; + tmp->size = n->size - AHASH_INIT_SIZE; + for (j = 0, k = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + memcpy(tmp->value + k * dsize, data, dsize); + set_bit(j, tmp->used); + k++; } - n->size -= AHASH_INIT_SIZE; - memcpy(tmp, n->value, n->size * set->dsize); - kfree(n->value); - n->value = tmp; + tmp->pos = k; + rcu_assign_pointer(hbucket(t, key), tmp); + kfree_rcu(n, rcu); } - ret = 0; goto out; } out: - rcu_read_unlock_bh(); return ret; } @@ -801,7 +939,8 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, #ifdef IP_SET_HASH_WITH_NETS /* Special test function which takes into account the different network - * sizes added to the set */ + * sizes added to the set + */ static int mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, const struct ip_set_ext *ext, @@ -824,16 +963,21 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) { #if IPSET_NET_COUNT == 2 mtype_data_reset_elem(d, &orig); - mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]), false); + mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false); for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi; k++) { - mtype_data_netmask(d, NCIDR(h->nets[k].cidr[1]), true); + mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]), + true); #else - mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0])); + mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0])); #endif key = HKEY(d, h->initval, t->htable_bits); - n = hbucket(t, key); + n = rcu_dereference_bh(hbucket(t, key)); + if (!n) + continue; for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; data = ahash_data(n, i, set->dsize); if (!mtype_data_equal(data, d, &multi)) continue; @@ -871,13 +1015,13 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, int i, ret = 0; u32 key, multi = 0; - rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); #ifdef IP_SET_HASH_WITH_NETS /* If we test an IP address and not a network address, - * try all possible network sizes */ + * try all possible network sizes + */ for (i = 0; i < IPSET_NET_COUNT; i++) - if (GCIDR(d->cidr, i) != SET_HOST_MASK(set->family)) + if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family)) break; if (i == IPSET_NET_COUNT) { ret = mtype_test_cidrs(set, d, ext, mext, flags); @@ -886,8 +1030,14 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, #endif key = HKEY(d, h->initval, t->htable_bits); - n = hbucket(t, key); + n = rcu_dereference_bh(hbucket(t, key)); + if (!n) { + ret = 0; + goto out; + } for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi) && !(SET_WITH_TIMEOUT(set) && @@ -897,7 +1047,6 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, } } out: - rcu_read_unlock_bh(); return ret; } @@ -909,15 +1058,19 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) const struct htable *t; struct nlattr *nested; size_t memsize; + u8 htable_bits; + rcu_read_lock_bh(); t = rcu_dereference_bh_nfnl(h->table); memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize); + htable_bits = t->htable_bits; + rcu_read_unlock_bh(); nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, - htonl(jhash_size(t->htable_bits))) || + htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; #ifdef IP_SET_HASH_WITH_NETMASK @@ -941,32 +1094,63 @@ nla_put_failure: return -EMSGSIZE; } +/* Make possible to run dumping parallel with resizing */ +static void +mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start) +{ + struct htype *h = set->data; + struct htable *t; + + if (start) { + rcu_read_lock_bh(); + t = rcu_dereference_bh_nfnl(h->table); + atomic_inc(&t->uref); + cb->args[IPSET_CB_PRIVATE] = (unsigned long)t; + rcu_read_unlock_bh(); + } else if (cb->args[IPSET_CB_PRIVATE]) { + t = (struct htable *)cb->args[IPSET_CB_PRIVATE]; + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + /* Resizing didn't destroy the hash table */ + pr_debug("Table destroy by dump: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + cb->args[IPSET_CB_PRIVATE] = 0; + } +} + /* Reply a LIST/SAVE request: dump the elements of the specified set */ static int mtype_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { - const struct htype *h = set->data; - const struct htable *t = rcu_dereference_bh_nfnl(h->table); + const struct htable *t; struct nlattr *atd, *nested; const struct hbucket *n; const struct mtype_elem *e; u32 first = cb->args[IPSET_CB_ARG0]; /* We assume that one hash bucket fills into one page */ void *incomplete; - int i; + int i, ret = 0; atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; + pr_debug("list hash set %s\n", set->name); + t = (const struct htable *)cb->args[IPSET_CB_PRIVATE]; + /* Expire may replace a hbucket with another one */ + rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); cb->args[IPSET_CB_ARG0]++) { incomplete = skb_tail_pointer(skb); - n = hbucket(t, cb->args[IPSET_CB_ARG0]); + n = rcu_dereference(hbucket(t, cb->args[IPSET_CB_ARG0])); pr_debug("cb->arg bucket: %lu, t %p n %p\n", cb->args[IPSET_CB_ARG0], t, n); + if (!n) + continue; for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; e = ahash_data(n, i, set->dsize); if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) @@ -977,9 +1161,10 @@ mtype_list(const struct ip_set *set, if (!nested) { if (cb->args[IPSET_CB_ARG0] == first) { nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + goto nla_put_failure; } if (mtype_data_list(skb, e)) goto nla_put_failure; @@ -992,7 +1177,7 @@ mtype_list(const struct ip_set *set, /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: nlmsg_trim(skb, incomplete); @@ -1000,20 +1185,24 @@ nla_put_failure: pr_warn("Can't list set %s: one bucket does not fit into a message. Please report it!\n", set->name); cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; + } else { + ipset_nest_end(skb, atd); } - ipset_nest_end(skb, atd); - return 0; +out: + rcu_read_unlock(); + return ret; } static int IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt); + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt); static int IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); + enum ipset_adt adt, u32 *lineno, u32 flags, + bool retried); static const struct ip_set_type_variant mtype_variant = { .kadt = mtype_kadt, @@ -1027,6 +1216,7 @@ static const struct ip_set_type_variant mtype_variant = { .flush = mtype_flush, .head = mtype_head, .list = mtype_list, + .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, }; @@ -1045,7 +1235,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, u8 netmask; #endif size_t hsize; - struct HTYPE *h; + struct htype *h; struct htable *t; #ifndef IP_SET_PROTO_UNDEF @@ -1064,12 +1254,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || -#ifdef IP_SET_HASH_WITH_MARKMASK - !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) || -#endif !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; +#ifdef IP_SET_HASH_WITH_MARKMASK + /* Separated condition in order to avoid directive in argument list */ + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK))) + return -IPSET_ERR_PROTOCOL; +#endif if (tb[IPSET_ATTR_HASHSIZE]) { hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); @@ -1092,7 +1284,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif #ifdef IP_SET_HASH_WITH_MARKMASK if (tb[IPSET_ATTR_MARKMASK]) { - markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); + markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK])); if (markmask == 0) return -IPSET_ERR_INVALID_MARKMASK; @@ -1137,12 +1329,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif set->variant = &IPSET_TOKEN(HTYPE, 4_variant); set->dsize = ip_set_elem_len(set, tb, - sizeof(struct IPSET_TOKEN(HTYPE, 4_elem))); + sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)), + __alignof__(struct IPSET_TOKEN(HTYPE, 4_elem))); #ifndef IP_SET_PROTO_UNDEF } else { set->variant = &IPSET_TOKEN(HTYPE, 6_variant); set->dsize = ip_set_elem_len(set, tb, - sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); + sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)), + __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem))); } #endif if (tb[IPSET_ATTR_TIMEOUT]) { @@ -1165,3 +1359,5 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, return 0; } #endif /* IP_SET_EMIT_CREATE */ + +#undef HKEY_DATALEN diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ip.c b/kernel/net/netfilter/ipset/ip_set_hash_ip.c index 76959d79e..9d6bf19f7 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_ip.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_ip.c @@ -56,15 +56,15 @@ hash_ip4_data_equal(const struct hash_ip4_elem *e1, return e1->ip == e2->ip; } -static inline bool +static bool hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -74,7 +74,6 @@ hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) } #define MTYPE hash_ip4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -109,20 +108,17 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, hosts; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -145,7 +141,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -162,8 +158,8 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -196,10 +192,10 @@ hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -208,12 +204,9 @@ hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) } #undef MTYPE -#undef PF #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ip6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE @@ -247,22 +240,25 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -301,7 +297,8 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -318,6 +315,7 @@ hash_ip_init(void) static void __exit hash_ip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ip_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c b/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c index 7abf9788c..a0695a2ab 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -63,10 +63,10 @@ hash_ipmark4_data_list(struct sk_buff *skb, if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -76,10 +76,8 @@ hash_ipmark4_data_next(struct hash_ipmark4_elem *next, next->ip = d->ip; } -#define MTYPE hash_ipmark4 -#define PF 4 -#define HOST_MASK 32 -#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem) +#define MTYPE hash_ipmark4 +#define HOST_MASK 32 #include "ip_set_hash_gen.h" static int @@ -110,25 +108,22 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip, ip_to = 0; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST || @@ -147,7 +142,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -160,8 +155,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -191,10 +186,10 @@ hash_ipmark6_data_list(struct sk_buff *skb, if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -204,18 +199,13 @@ hash_ipmark6_data_next(struct hash_ipmark4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ipmark6 -#define PF 6 #define HOST_MASK 128 -#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem) -#define IP_SET_EMIT_CREATE +#define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" - static int hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -243,27 +233,30 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; - e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST) { @@ -274,10 +267,8 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; - return ret; + return 0; } static struct ip_set_type hash_ipmark_type __read_mostly = { @@ -307,7 +298,8 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -324,6 +316,7 @@ hash_ipmark_init(void) static void __exit hash_ipmark_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipmark_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipport.c b/kernel/net/netfilter/ipset/ip_set_hash_ipport.c index dcbcceb9a..9d84b3dff 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_ipport.c @@ -69,10 +69,10 @@ hash_ipport4_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -83,10 +83,8 @@ hash_ipport4_data_next(struct hash_ipport4_elem *next, next->port = d->port; } -#define MTYPE hash_ipport4 -#define PF 4 -#define HOST_MASK 32 -#define HKEY_DATALEN sizeof(struct hash_ipport4_elem) +#define MTYPE hash_ipport4 +#define HOST_MASK 32 #include "ip_set_hash_gen.h" static int @@ -118,29 +116,23 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -148,8 +140,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -171,7 +164,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -195,8 +188,8 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } } return ret; @@ -231,10 +224,10 @@ hash_ipport6_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -245,15 +238,11 @@ hash_ipport6_data_next(struct hash_ipport4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ipport6 -#define PF 6 #define HOST_MASK 128 -#define HKEY_DATALEN sizeof(struct hash_ipport6_elem) -#define IP_SET_EMIT_CREATE +#define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int @@ -285,31 +274,31 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -317,8 +306,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; @@ -341,8 +331,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -376,7 +366,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -393,6 +384,7 @@ hash_ipport_init(void) static void __exit hash_ipport_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipport_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c b/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c index 7ef93fc88..215b7b942 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -63,17 +63,17 @@ hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, static bool hash_ipportip4_data_list(struct sk_buff *skb, - const struct hash_ipportip4_elem *data) + const struct hash_ipportip4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -86,7 +86,6 @@ hash_ipportip4_data_next(struct hash_ipportip4_elem *next, /* Common functions */ #define MTYPE hash_ipportip4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -120,22 +119,19 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -143,10 +139,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -154,8 +147,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -177,7 +171,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -201,8 +195,8 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } } return ret; @@ -240,10 +234,10 @@ hash_ipportip6_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -254,11 +248,9 @@ hash_ipportip6_data_next(struct hash_ipportip4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_ipportip6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -293,24 +285,27 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -318,10 +313,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -329,8 +321,9 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; @@ -353,8 +346,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -388,7 +381,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -405,6 +399,7 @@ hash_ipportip_init(void) static void __exit hash_ipportip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipportip_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c b/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c index b6012ad92..9ca719625 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -114,10 +114,10 @@ hash_ipportnet4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -130,7 +130,6 @@ hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, } #define MTYPE hash_ipportnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -142,7 +141,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -174,23 +173,20 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -205,10 +201,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr = cidr - 1; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -216,14 +209,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -249,7 +244,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -270,8 +265,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); + } if (retried) ip = ntohl(h->next.ip); @@ -294,8 +290,8 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = ip2_last + 1; } } @@ -367,10 +363,10 @@ hash_ipportnet6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -381,11 +377,9 @@ hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_ipportnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -398,7 +392,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -429,27 +423,28 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -466,10 +461,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip2, e.cidr + 1); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -477,14 +469,16 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -508,8 +502,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -547,7 +541,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -564,6 +559,7 @@ hash_ipportnet_init(void) static void __exit hash_ipportnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipportnet_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_mac.c b/kernel/net/netfilter/ipset/ip_set_hash_mac.c index 65690b52a..f1e7d2c0f 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_mac.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_mac.c @@ -52,7 +52,12 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1, static inline bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { - return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether); + if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) + goto nla_put_failure; + return false; + +nla_put_failure: + return true; } static inline void @@ -62,7 +67,6 @@ hash_mac4_data_next(struct hash_mac4_elem *next, } #define MTYPE hash_mac4 -#define PF 4 #define HOST_MASK 32 #define IP_SET_EMIT_CREATE #define IP_SET_PROTO_UNDEF @@ -85,10 +89,10 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, return 0; if (skb_mac_header(skb) < skb->head || - (skb_mac_header(skb) + ETH_HLEN) > skb->data) + (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); @@ -103,22 +107,16 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - if (unlikely(!tb[IPSET_ATTR_ETHER] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_ETHER])) + return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER])); if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) return -IPSET_ERR_HASH_ELEM; @@ -149,7 +147,8 @@ static struct ip_set_type hash_mac_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -166,6 +165,7 @@ hash_mac_init(void) static void __exit hash_mac_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_mac_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_net.c b/kernel/net/netfilter/ipset/ip_set_hash_net.c index 6b3ac10ac..3e4bffdc1 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_net.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_net.c @@ -95,10 +95,10 @@ hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -109,7 +109,6 @@ hash_net4_data_next(struct hash_net4_elem *next, } #define MTYPE hash_net4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -121,7 +120,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -147,21 +146,18 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, last; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -173,6 +169,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -180,7 +177,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt, set) ? -ret: + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } @@ -202,8 +199,8 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip = last + 1; } return ret; @@ -264,10 +261,10 @@ hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -277,11 +274,9 @@ hash_net6_data_next(struct hash_net4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_net6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -294,7 +289,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -318,36 +313,34 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - - if (!e.cidr || e.cidr > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr || e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip, e.cidr); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -383,7 +376,8 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -400,6 +394,7 @@ hash_net_init(void) static void __exit hash_net_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_net_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netiface.c b/kernel/net/netfilter/ipset/ip_set_hash_netiface.c index 380ef5148..43d8c9896 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_netiface.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -37,88 +36,13 @@ MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); -/* Interface name rbtree */ - -struct iface_node { - struct rb_node node; - char iface[IFNAMSIZ]; -}; - -#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface) - -static void -rbtree_destroy(struct rb_root *root) -{ - struct iface_node *node, *next; - - rbtree_postorder_for_each_entry_safe(node, next, root, node) - kfree(node); - - *root = RB_ROOT; -} - -static int -iface_test(struct rb_root *root, const char **iface) -{ - struct rb_node *n = root->rb_node; - - while (n) { - const char *d = iface_data(n); - int res = strcmp(*iface, d); - - if (res < 0) - n = n->rb_left; - else if (res > 0) - n = n->rb_right; - else { - *iface = d; - return 1; - } - } - return 0; -} - -static int -iface_add(struct rb_root *root, const char **iface) -{ - struct rb_node **n = &(root->rb_node), *p = NULL; - struct iface_node *d; - - while (*n) { - char *ifname = iface_data(*n); - int res = strcmp(*iface, ifname); - - p = *n; - if (res < 0) - n = &((*n)->rb_left); - else if (res > 0) - n = &((*n)->rb_right); - else { - *iface = ifname; - return 0; - } - } - - d = kzalloc(sizeof(*d), GFP_ATOMIC); - if (!d) - return -ENOMEM; - strcpy(d->iface, *iface); - - rb_link_node(&d->node, p, n); - rb_insert_color(&d->node, root); - - *iface = d->iface; - return 0; -} - /* Type specific function prefix */ #define HTYPE hash_netiface #define IP_SET_HASH_WITH_NETS -#define IP_SET_HASH_WITH_RBTREE #define IP_SET_HASH_WITH_MULTI #define IP_SET_HASH_WITH_NET0 -#define STREQ(a, b) (strcmp(a, b) == 0) +#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ) /* IPv4 variant */ @@ -137,7 +61,7 @@ struct hash_netiface4_elem { u8 cidr; u8 nomatch; u8 elem; - const char *iface; + char iface[IFNAMSIZ]; }; /* Common functions */ @@ -151,7 +75,7 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - ip1->iface == ip2->iface; + strcmp(ip1->iface, ip2->iface) == 0; } static inline int @@ -193,10 +117,10 @@ hash_netiface4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -207,7 +131,6 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next, } #define MTYPE hash_netiface4 -#define PF 4 #define HOST_MASK 32 #define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) #include "ip_set_hash_gen.h" @@ -220,7 +143,7 @@ static const char *get_physindev_name(const struct sk_buff *skb) return dev ? dev->name : NULL; } -static const char *get_phyoutdev_name(const struct sk_buff *skb) +static const char *get_physoutdev_name(const struct sk_buff *skb) { struct net_device *dev = nf_bridge_get_physoutdev(skb); @@ -236,11 +159,10 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - int ret; if (e.cidr == 0) return -EINVAL; @@ -250,35 +172,25 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= ip_set_netmask(e.cidr); -#define IFACE(dir) (par->dir ? par->dir->name : NULL) +#define IFACE(dir) (par->dir ? par->dir->name : "") #define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - e.iface = SRCDIR ? get_physindev_name(skb) : - get_phyoutdev_name(skb); + const char *eiface = SRCDIR ? get_physindev_name(skb) : + get_physoutdev_name(skb); - if (!e.iface) + if (!eiface) return -EINVAL; + STRLCPY(e.iface, eiface); e.physdev = 1; -#else - e.iface = NULL; #endif - } else - e.iface = SRCDIR ? IFACE(in) : IFACE(out); + } else { + STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); + } - if (!e.iface) + if (strlen(e.iface) == 0) return -EINVAL; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; - return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -291,25 +203,21 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; - char iface[IFNAMSIZ]; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -318,21 +226,11 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - e.iface = iface; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; + nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) @@ -353,8 +251,9 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr); + } if (retried) ip = ntohl(h->next.ip); @@ -365,8 +264,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip = last + 1; } return ret; @@ -388,7 +287,7 @@ struct hash_netiface6_elem { u8 cidr; u8 nomatch; u8 elem; - const char *iface; + char iface[IFNAMSIZ]; }; /* Common functions */ @@ -402,7 +301,7 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - ip1->iface == ip2->iface; + strcmp(ip1->iface, ip2->iface) == 0; } static inline int @@ -444,10 +343,10 @@ hash_netiface6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -457,12 +356,9 @@ hash_netiface6_data_next(struct hash_netiface4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_netiface6 -#define PF 6 #define HOST_MASK 128 #define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) #define IP_SET_EMIT_CREATE @@ -476,11 +372,10 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - int ret; if (e.cidr == 0) return -EINVAL; @@ -492,85 +387,64 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - e.iface = SRCDIR ? get_physindev_name(skb) : - get_phyoutdev_name(skb); - if (!e.iface) - return -EINVAL; + const char *eiface = SRCDIR ? get_physindev_name(skb) : + get_physoutdev_name(skb); + if (!eiface) + return -EINVAL; + STRLCPY(e.iface, eiface); e.physdev = 1; -#else - e.iface = NULL; #endif - } else - e.iface = SRCDIR ? IFACE(in) : IFACE(out); + } else { + STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); + } - if (!e.iface) + if (strlen(e.iface) == 0) return -EINVAL; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - char iface[IFNAMSIZ]; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (e.cidr > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + ip6_netmask(&e.ip, e.cidr); - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - e.iface = iface; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; + nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) @@ -613,7 +487,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -630,6 +505,7 @@ hash_netiface_init(void) static void __exit hash_netiface_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netiface_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netnet.c b/kernel/net/netfilter/ipset/ip_set_hash_netnet.c index ea8772afb..a93dfebff 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_netnet.c @@ -57,8 +57,8 @@ struct hash_netnet4_elem { static inline bool hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, - const struct hash_netnet4_elem *ip2, - u32 *multi) + const struct hash_netnet4_elem *ip2, + u32 *multi) { return ip1->ipcmp == ip2->ipcmp && ip1->ccmp == ip2->ccmp; @@ -84,7 +84,7 @@ hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) static inline void hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, - struct hash_netnet4_elem *orig) + struct hash_netnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -103,7 +103,7 @@ hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) static bool hash_netnet4_data_list(struct sk_buff *skb, - const struct hash_netnet4_elem *data) + const struct hash_netnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -122,28 +122,34 @@ nla_put_failure: static inline void hash_netnet4_data_next(struct hash_netnet4_elem *next, - const struct hash_netnet4_elem *d) + const struct hash_netnet4_elem *d) { next->ipcmp = d->ipcmp; } #define MTYPE hash_netnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netnet4_init(struct hash_netnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; @@ -157,7 +163,7 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -165,45 +171,43 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; - u8 cidr, cidr2; int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + hash_netnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > HOST_MASK) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[0] = cidr; } if (tb[IPSET_ATTR_CIDR2]) { - cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!cidr2 || cidr2 > HOST_MASK) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[1] = cidr2; } if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -226,8 +230,9 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr[0]); + } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { @@ -238,28 +243,27 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); + } if (retried) ip = ntohl(h->next.ip[0]); while (!after(ip, ip_to)) { e.ip[0] = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &cidr); - e.cidr[0] = cidr; + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); ip2 = (retried && ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) : ip2_from; while (!after(ip2, ip2_to)) { e.ip[1] = htonl(ip2); - last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2); - e.cidr[1] = cidr2; + last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = last2 + 1; } ip = last + 1; @@ -283,8 +287,8 @@ struct hash_netnet6_elem { static inline bool hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, - const struct hash_netnet6_elem *ip2, - u32 *multi) + const struct hash_netnet6_elem *ip2, + u32 *multi) { return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && @@ -311,7 +315,7 @@ hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) static inline void hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, - struct hash_netnet6_elem *orig) + struct hash_netnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -330,7 +334,7 @@ hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) static bool hash_netnet6_data_list(struct sk_buff *skb, - const struct hash_netnet6_elem *data) + const struct hash_netnet6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -349,34 +353,39 @@ nla_put_failure: static inline void hash_netnet6_data_next(struct hash_netnet4_elem *next, - const struct hash_netnet6_elem *d) + const struct hash_netnet6_elem *d) { } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netnet6_init(struct hash_netnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) - e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK; + e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6); @@ -388,50 +397,53 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + hash_netnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || - ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - if (tb[IPSET_ATTR_CIDR2]) + if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - - if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || - e.cidr[1] > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -470,7 +482,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -487,6 +500,7 @@ hash_netnet_init(void) static void __exit hash_netnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netnet_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netport.c b/kernel/net/netfilter/ipset/ip_set_hash_netport.c index c0ddb58d1..731813e0f 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_netport.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_netport.c @@ -110,10 +110,10 @@ hash_netport4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -125,7 +125,6 @@ hash_netport4_data_next(struct hash_netport4_elem *next, } #define MTYPE hash_netport4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -137,7 +136,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -167,23 +166,20 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -194,10 +190,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr = cidr - 1; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -205,8 +198,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -215,6 +209,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -240,8 +235,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); + } if (retried) ip = ntohl(h->next.ip); @@ -257,8 +253,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } ip = last + 1; } @@ -326,10 +322,10 @@ hash_netport6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -340,11 +336,9 @@ hash_netport6_data_next(struct hash_netport4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netport6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -357,7 +351,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -387,25 +381,22 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -417,10 +408,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], } ip6_netmask(&e.ip, e.cidr + 1); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -428,14 +416,16 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -459,8 +449,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -495,7 +485,8 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -512,6 +503,7 @@ hash_netport_init(void) static void __exit hash_netport_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netport_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c b/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c index bfaa94c7b..9a14c2378 100644 --- a/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/kernel/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -54,7 +54,7 @@ struct hash_netportnet4_elem { u16 ccmp; }; u16 padding; - u8 nomatch:1; + u8 nomatch; u8 proto; }; @@ -62,8 +62,8 @@ struct hash_netportnet4_elem { static inline bool hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, - const struct hash_netportnet4_elem *ip2, - u32 *multi) + const struct hash_netportnet4_elem *ip2, + u32 *multi) { return ip1->ipcmp == ip2->ipcmp && ip1->ccmp == ip2->ccmp && @@ -91,7 +91,7 @@ hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) static inline void hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, - struct hash_netportnet4_elem *orig) + struct hash_netportnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -111,7 +111,7 @@ hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, static bool hash_netportnet4_data_list(struct sk_buff *skb, - const struct hash_netportnet4_elem *data) + const struct hash_netportnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -124,37 +124,43 @@ hash_netportnet4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void hash_netportnet4_data_next(struct hash_netportnet4_elem *next, - const struct hash_netportnet4_elem *d) + const struct hash_netportnet4_elem *d) { next->ipcmp = d->ipcmp; next->port = d->port; } #define MTYPE hash_netportnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netportnet4_init(struct hash_netportnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; @@ -172,7 +178,7 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -181,49 +187,43 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; bool with_ports = false; - u8 cidr, cidr2; int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + hash_netportnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > HOST_MASK) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[0] = cidr; } if (tb[IPSET_ATTR_CIDR2]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!cidr || cidr > HOST_MASK) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[1] = cidr; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -231,14 +231,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -262,8 +264,9 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr[0]); + } port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { @@ -281,16 +284,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); + } if (retried) ip = ntohl(h->next.ip[0]); while (!after(ip, ip_to)) { e.ip[0] = htonl(ip); - ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr); - e.cidr[0] = cidr; + ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { @@ -301,13 +304,12 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], while (!after(ip2, ip2_to)) { e.ip[1] = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, - &cidr2); - e.cidr[1] = cidr2; + &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = ip2_last + 1; } } @@ -326,7 +328,7 @@ struct hash_netportnet6_elem { u16 ccmp; }; u16 padding; - u8 nomatch:1; + u8 nomatch; u8 proto; }; @@ -334,8 +336,8 @@ struct hash_netportnet6_elem { static inline bool hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, - const struct hash_netportnet6_elem *ip2, - u32 *multi) + const struct hash_netportnet6_elem *ip2, + u32 *multi) { return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && @@ -364,7 +366,7 @@ hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) static inline void hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, - struct hash_netportnet6_elem *orig) + struct hash_netportnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -384,7 +386,7 @@ hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, static bool hash_netportnet6_data_list(struct sk_buff *skb, - const struct hash_netportnet6_elem *data) + const struct hash_netportnet6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -397,41 +399,46 @@ hash_netportnet6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void hash_netportnet6_data_next(struct hash_netportnet4_elem *next, - const struct hash_netportnet6_elem *d) + const struct hash_netportnet6_elem *d) { next->port = d->port; } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netportnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netportnet6_init(struct hash_netportnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; @@ -449,7 +456,7 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; @@ -459,47 +466,46 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + hash_netportnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); + if (ret) + return ret; + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || - ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - if (tb[IPSET_ATTR_CIDR2]) + if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - - if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || - e.cidr[1] > HOST_MASK)) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -507,14 +513,16 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -538,8 +546,8 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -577,7 +585,8 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -594,6 +603,7 @@ hash_netportnet_init(void) static void __exit hash_netportnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netportnet_type); } diff --git a/kernel/net/netfilter/ipset/ip_set_list_set.c b/kernel/net/netfilter/ipset/ip_set_list_set.c index f8f682806..bbede95c9 100644 --- a/kernel/net/netfilter/ipset/ip_set_list_set.c +++ b/kernel/net/netfilter/ipset/ip_set_list_set.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -27,8 +28,10 @@ MODULE_ALIAS("ip_set_list:set"); /* Member elements */ struct set_elem { + struct rcu_head rcu; + struct list_head list; ip_set_id_t id; -}; +} __aligned(__alignof__(u64)); struct set_adt_elem { ip_set_id_t id; @@ -41,12 +44,9 @@ struct list_set { u32 size; /* size of set list array */ struct timer_list gc; /* garbage collection */ struct net *net; /* namespace */ - struct set_elem members[0]; /* the set members */ + struct list_head members; /* the set members */ }; -#define list_set_elem(set, map, id) \ - (struct set_elem *)((void *)(map)->members + (id) * (set)->dsize) - static int list_set_ktest(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -54,17 +54,14 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i, cmdflags = opt->cmdflags; + u32 cmdflags = opt->cmdflags; int ret; /* Don't lookup sub-counters at all */ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -91,13 +88,9 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -115,13 +108,9 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -138,110 +127,65 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + int ret = -EINVAL; + rcu_read_lock(); switch (adt) { case IPSET_TEST: - return list_set_ktest(set, skb, par, opt, &ext); + ret = list_set_ktest(set, skb, par, opt, &ext); + break; case IPSET_ADD: - return list_set_kadd(set, skb, par, opt, &ext); + ret = list_set_kadd(set, skb, par, opt, &ext); + break; case IPSET_DEL: - return list_set_kdel(set, skb, par, opt, &ext); + ret = list_set_kdel(set, skb, par, opt, &ext); + break; default: break; } - return -EINVAL; -} - -static bool -id_eq(const struct ip_set *set, u32 i, ip_set_id_t id) -{ - const struct list_set *map = set->data; - const struct set_elem *e; - - if (i >= map->size) - return 0; + rcu_read_unlock(); - e = list_set_elem(set, map, i); - return !!(e->id == id && - !(SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set)))); + return ret; } -static int -list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, - const struct ip_set_ext *ext) -{ - struct list_set *map = set->data; - struct set_elem *e = list_set_elem(set, map, i); +/* Userspace interfaces: we are protected by the nfnl mutex */ - if (e->id != IPSET_INVALID_ID) { - if (i == map->size - 1) { - /* Last element replaced: e.g. add new,before,last */ - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - } else { - struct set_elem *x = list_set_elem(set, map, - map->size - 1); - - /* Last element pushed off */ - if (x->id != IPSET_INVALID_ID) { - ip_set_put_byindex(map->net, x->id); - ip_set_ext_destroy(set, x); - } - memmove(list_set_elem(set, map, i + 1), e, - set->dsize * (map->size - (i + 1))); - /* Extensions must be initialized to zero */ - memset(e, 0, set->dsize); - } - } - - e->id = d->id; - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(e, set), ext->timeout); - if (SET_WITH_COUNTER(set)) - ip_set_init_counter(ext_counter(e, set), ext); - if (SET_WITH_COMMENT(set)) - ip_set_init_comment(ext_comment(e, set), ext); - if (SET_WITH_SKBINFO(set)) - ip_set_init_skbinfo(ext_skbinfo(e, set), ext); - return 0; -} - -static int -list_set_del(struct ip_set *set, u32 i) +static void +__list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; - struct set_elem *e = list_set_elem(set, map, i); ip_set_put_byindex(map->net, e->id); + /* We may call it, because we don't have a to be destroyed + * extension which is used by the kernel. + */ ip_set_ext_destroy(set, e); + kfree_rcu(e, rcu); +} - if (i < map->size - 1) - memmove(e, list_set_elem(set, map, i + 1), - set->dsize * (map->size - (i + 1))); +static inline void +list_set_del(struct ip_set *set, struct set_elem *e) +{ + list_del_rcu(&e->list); + __list_set_del(set, e); +} - /* Last element */ - e = list_set_elem(set, map, map->size - 1); - e->id = IPSET_INVALID_ID; - return 0; +static inline void +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) +{ + list_replace_rcu(&old->list, &e->list); + __list_set_del(set, old); } static void set_cleanup_entries(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e; - u32 i = 0; + struct set_elem *e, *n; - while (i < map->size) { - e = list_set_elem(set, map, i); - if (e->id != IPSET_INVALID_ID && - ip_set_timeout_expired(ext_timeout(e, set))) - list_set_del(set, i); - /* Check element moved to position i in next loop */ - else - i++; - } + list_for_each_entry_safe(e, n, &map->members, list) + if (ip_set_timeout_expired(ext_timeout(e, set))) + list_set_del(set, e); } static int @@ -250,31 +194,46 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; - u32 i; + struct set_elem *e, *next, *prev = NULL; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; - else if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - else if (e->id != d->id) + else if (e->id != d->id) { + prev = e; continue; + } - if (d->before == 0) - return 1; - else if (d->before > 0) - ret = id_eq(set, i + 1, d->refid); - else - ret = i > 0 && id_eq(set, i - 1, d->refid); + if (d->before == 0) { + ret = 1; + } else if (d->before > 0) { + next = list_next_entry(e, list); + ret = !list_is_last(&e->list, &map->members) && + next->id == d->refid; + } else { + ret = prev && prev->id == d->refid; + } return ret; } return 0; } +static void +list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext, + struct set_elem *e) +{ + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(e, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(e, set), ext); + /* Update timeout last */ + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, set), ext->timeout); +} static int list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -282,60 +241,78 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; + struct set_elem *e, *n, *prev, *next; bool flag_exist = flags & IPSET_FLAG_EXIST; - u32 i, ret = 0; if (SET_WITH_TIMEOUT(set)) set_cleanup_entries(set); - /* Check already added element */ - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - goto insert; - else if (e->id != d->id) + /* Find where to add the new entry */ + n = prev = next = NULL; + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - - if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) || - (d->before < 0 && - (i == 0 || !id_eq(set, i - 1, d->refid)))) - /* Before/after doesn't match */ + else if (d->id == e->id) + n = e; + else if (d->before == 0 || e->id != d->refid) + continue; + else if (d->before > 0) + next = e; + else + prev = e; + } + /* Re-add already existing element */ + if (n) { + if ((d->before > 0 && !next) || + (d->before < 0 && !prev)) return -IPSET_ERR_REF_EXIST; if (!flag_exist) - /* Can't re-add */ return -IPSET_ERR_EXIST; /* Update extensions */ - ip_set_ext_destroy(set, e); + ip_set_ext_destroy(set, n); + list_set_init_extensions(set, ext, n); - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(e, set), ext->timeout); - if (SET_WITH_COUNTER(set)) - ip_set_init_counter(ext_counter(e, set), ext); - if (SET_WITH_COMMENT(set)) - ip_set_init_comment(ext_comment(e, set), ext); - if (SET_WITH_SKBINFO(set)) - ip_set_init_skbinfo(ext_skbinfo(e, set), ext); /* Set is already added to the list */ ip_set_put_byindex(map->net, d->id); return 0; } -insert: - ret = -IPSET_ERR_LIST_FULL; - for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - ret = d->before != 0 ? -IPSET_ERR_REF_EXIST - : list_set_add(set, i, d, ext); - else if (e->id != d->refid) - continue; - else if (d->before > 0) - ret = list_set_add(set, i, d, ext); - else if (i + 1 < map->size) - ret = list_set_add(set, i + 1, d, ext); + /* Add new entry */ + if (d->before == 0) { + /* Append */ + n = list_empty(&map->members) ? NULL : + list_last_entry(&map->members, struct set_elem, list); + } else if (d->before > 0) { + /* Insert after next element */ + if (!list_is_last(&next->list, &map->members)) + n = list_next_entry(next, list); + } else { + /* Insert before prev element */ + if (prev->list.prev != &map->members) + n = list_prev_entry(prev, list); } + /* Can we replace a timed out entry? */ + if (n && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(n, set)))) + n = NULL; + + e = kzalloc(set->dsize, GFP_ATOMIC); + if (!e) + return -ENOMEM; + e->id = d->id; + INIT_LIST_HEAD(&e->list); + list_set_init_extensions(set, ext, e); + if (n) + list_set_replace(set, e, n); + else if (next) + list_add_tail_rcu(&e->list, &next->list); + else if (prev) + list_add_rcu(&e->list, &prev->list); + else + list_add_tail_rcu(&e->list, &map->members); - return ret; + return 0; } static int @@ -344,32 +321,30 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; - u32 i; - - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return d->before != 0 ? -IPSET_ERR_REF_EXIST - : -IPSET_ERR_EXIST; - else if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + struct set_elem *e, *next, *prev = NULL; + + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - else if (e->id != d->id) + else if (e->id != d->id) { + prev = e; continue; + } - if (d->before == 0) - return list_set_del(set, i); - else if (d->before > 0) { - if (!id_eq(set, i + 1, d->refid)) + if (d->before > 0) { + next = list_next_entry(e, list); + if (list_is_last(&e->list, &map->members) || + next->id != d->refid) return -IPSET_ERR_REF_EXIST; - return list_set_del(set, i); - } else if (i == 0 || !id_eq(set, i - 1, d->refid)) - return -IPSET_ERR_REF_EXIST; - else - return list_set_del(set, i); + } else if (d->before < 0) { + if (!prev || prev->id != d->refid) + return -IPSET_ERR_REF_EXIST; + } + list_set_del(set, e); + return 0; } - return -IPSET_ERR_EXIST; + return d->before != 0 ? -IPSET_ERR_REF_EXIST : -IPSET_ERR_EXIST; } static int @@ -383,19 +358,13 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set *s; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_NAME] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_NAME] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -410,6 +379,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + e.before = f & IPSET_FLAG_BEFORE; } @@ -447,27 +417,26 @@ static void list_set_flush(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e; - u32 i; - - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id != IPSET_INVALID_ID) { - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - e->id = IPSET_INVALID_ID; - } - } + struct set_elem *e, *n; + + list_for_each_entry_safe(e, n, &map->members, list) + list_set_del(set, e); } static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; + struct set_elem *e, *n; if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); - list_set_flush(set); + list_for_each_entry_safe(e, n, &map->members, list) { + list_del(&e->list); + ip_set_put_byindex(map->net, e->id); + ip_set_ext_destroy(set, e); + kfree(e); + } kfree(map); set->data = NULL; @@ -478,6 +447,11 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) { const struct list_set *map = set->data; struct nlattr *nested; + struct set_elem *e; + u32 n = 0; + + list_for_each_entry(e, &map->members, list) + n++; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) @@ -485,7 +459,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->size * set->dsize))) + htonl(sizeof(*map) + n * set->dsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; @@ -502,18 +476,22 @@ list_set_list(const struct ip_set *set, { const struct list_set *map = set->data; struct nlattr *atd, *nested; - u32 i, first = cb->args[IPSET_CB_ARG0]; - const struct set_elem *e; + u32 i = 0, first = cb->args[IPSET_CB_ARG0]; + struct set_elem *e; + int ret = 0; atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; - for (; cb->args[IPSET_CB_ARG0] < map->size; - cb->args[IPSET_CB_ARG0]++) { - i = cb->args[IPSET_CB_ARG0]; - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - goto finish; + list_for_each_entry(e, &map->members, list) { + if (i == first) + break; + i++; + } + + rcu_read_lock(); + list_for_each_entry_from(e, &map->members, list) { + i++; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -521,9 +499,10 @@ list_set_list(const struct ip_set *set, if (!nested) { if (i == first) { nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + goto nla_put_failure; } if (nla_put_string(skb, IPSET_ATTR_NAME, ip_set_name_byindex(map->net, e->id))) @@ -532,20 +511,23 @@ list_set_list(const struct ip_set *set, goto nla_put_failure; ipset_nest_end(skb, nested); } -finish: + ipset_nest_end(skb, atd); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(i == first)) { cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; } + cb->args[IPSET_CB_ARG0] = i - 1; ipset_nest_end(skb, atd); - return 0; +out: + rcu_read_unlock(); + return ret; } static bool @@ -577,12 +559,12 @@ static const struct ip_set_type_variant set_variant = { static void list_set_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct list_set *map = set->data; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set_cleanup_entries(set); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -594,7 +576,7 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) struct list_set *map = set->data; init_timer(&map->gc); - map->gc.data = (unsigned long) set; + map->gc.data = (unsigned long)set; map->gc.function = gc; map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -606,24 +588,16 @@ static bool init_list_set(struct net *net, struct ip_set *set, u32 size) { struct list_set *map; - struct set_elem *e; - u32 i; - map = kzalloc(sizeof(*map) + - min_t(u32, size, IP_SET_LIST_MAX_SIZE) * set->dsize, - GFP_KERNEL); + map = kzalloc(sizeof(*map), GFP_KERNEL); if (!map) return false; map->size = size; map->net = net; + INIT_LIST_HEAD(&map->members); set->data = map; - for (i = 0; i < size; i++) { - e = list_set_elem(set, map, i); - e->id = IPSET_INVALID_ID; - } - return true; } @@ -644,7 +618,8 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], size = IP_SET_LIST_MIN_SIZE; set->variant = &set_variant; - set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem)); + set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), + __alignof__(struct set_elem)); if (!init_list_set(net, set, size)) return -ENOMEM; if (tb[IPSET_ATTR_TIMEOUT]) { @@ -678,7 +653,8 @@ static struct ip_set_type list_set_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -695,6 +671,7 @@ list_set_init(void) static void __exit list_set_fini(void) { + rcu_barrier(); ip_set_type_unregister(&list_set_type); } diff --git a/kernel/net/netfilter/ipset/pfxlen.c b/kernel/net/netfilter/ipset/pfxlen.c index 04d15fdc9..1c8a42c10 100644 --- a/kernel/net/netfilter/ipset/pfxlen.c +++ b/kernel/net/netfilter/ipset/pfxlen.c @@ -1,9 +1,7 @@ #include #include -/* - * Prefixlen maps for fast conversions, by Jan Engelhardt. - */ +/* Prefixlen maps for fast conversions, by Jan Engelhardt. */ #define E(a, b, c, d) \ {.ip6 = { \ @@ -11,8 +9,7 @@ htonl(c), htonl(d), \ } } -/* - * This table works for both IPv4 and IPv6; +/* This table works for both IPv4 and IPv6; * just use prefixlen_netmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_netmask_map[] = { @@ -149,13 +146,12 @@ const union nf_inet_addr ip_set_netmask_map[] = { EXPORT_SYMBOL_GPL(ip_set_netmask_map); #undef E -#define E(a, b, c, d) \ - {.ip6 = { (__force __be32) a, (__force __be32) b, \ - (__force __be32) c, (__force __be32) d, \ +#define E(a, b, c, d) \ + {.ip6 = { (__force __be32)a, (__force __be32)b, \ + (__force __be32)c, (__force __be32)d, \ } } -/* - * This table works for both IPv4 and IPv6; +/* This table works for both IPv4 and IPv6; * just use prefixlen_hostmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_hostmask_map[] = { diff --git a/kernel/net/netfilter/ipvs/Kconfig b/kernel/net/netfilter/ipvs/Kconfig index 3b6929dec..b32fb0dbe 100644 --- a/kernel/net/netfilter/ipvs/Kconfig +++ b/kernel/net/netfilter/ipvs/Kconfig @@ -162,6 +162,17 @@ config IP_VS_FO If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_OVF + tristate "weighted overflow scheduling" + ---help--- + The weighted overflow scheduling algorithm directs network + connections to the server with the highest weight that is + currently available and overflows to the next when active + connections exceed the node's weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_LBLC tristate "locality-based least-connection scheduling" ---help--- diff --git a/kernel/net/netfilter/ipvs/Makefile b/kernel/net/netfilter/ipvs/Makefile index 38b2723b2..67f3f4389 100644 --- a/kernel/net/netfilter/ipvs/Makefile +++ b/kernel/net/netfilter/ipvs/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o +obj-$(CONFIG_IP_VS_OVF) += ip_vs_ovf.o obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o diff --git a/kernel/net/netfilter/ipvs/ip_vs_app.c b/kernel/net/netfilter/ipvs/ip_vs_app.c index dfd7b65b3..0328f7250 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_app.c +++ b/kernel/net/netfilter/ipvs/ip_vs_app.c @@ -75,7 +75,7 @@ static void ip_vs_app_inc_rcu_free(struct rcu_head *head) * Allocate/initialize app incarnation and register it in proto apps. */ static int -ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, +ip_vs_app_inc_new(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port) { struct ip_vs_protocol *pp; @@ -107,7 +107,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, } } - ret = pp->register_app(net, inc); + ret = pp->register_app(ipvs, inc); if (ret) goto out; @@ -127,7 +127,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, * Release app incarnation */ static void -ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) +ip_vs_app_inc_release(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_protocol *pp; @@ -135,7 +135,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) return; if (pp->unregister_app) - pp->unregister_app(net, inc); + pp->unregister_app(ipvs, inc); IP_VS_DBG(9, "%s App %s:%u unregistered\n", pp->name, inc->name, ntohs(inc->port)); @@ -175,14 +175,14 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc) * Register an application incarnation in protocol applications */ int -register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, +register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, __u16 port) { int result; mutex_lock(&__ip_vs_app_mutex); - result = ip_vs_app_inc_new(net, app, proto, port); + result = ip_vs_app_inc_new(ipvs, app, proto, port); mutex_unlock(&__ip_vs_app_mutex); @@ -191,15 +191,11 @@ register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, /* Register application for netns */ -struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app) +struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *a; int err = 0; - if (!ipvs) - return ERR_PTR(-ENOENT); - mutex_lock(&__ip_vs_app_mutex); list_for_each_entry(a, &ipvs->app_list, a_list) { @@ -230,21 +226,17 @@ out_unlock: * We are sure there are no app incarnations attached to services * Caller should use synchronize_rcu() or rcu_barrier() */ -void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) +void unregister_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *a, *anxt, *inc, *nxt; - if (!ipvs) - return; - mutex_lock(&__ip_vs_app_mutex); list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) { if (app && strcmp(app->name, a->name)) continue; list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) { - ip_vs_app_inc_release(net, inc); + ip_vs_app_inc_release(ipvs, inc); } list_del(&a->a_list); @@ -611,17 +603,19 @@ static const struct file_operations ip_vs_app_fops = { }; #endif -int __net_init ip_vs_app_net_init(struct net *net) +int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct net *net = ipvs->net; INIT_LIST_HEAD(&ipvs->app_list); proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops); return 0; } -void __net_exit ip_vs_app_net_cleanup(struct net *net) +void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { - unregister_ip_vs_app(net, NULL /* all */); + struct net *net = ipvs->net; + + unregister_ip_vs_app(ipvs, NULL /* all */); remove_proc_entry("ip_vs_app", net->proc_net); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_conn.c b/kernel/net/netfilter/ipvs/ip_vs_conn.c index b0f7b626b..85ca189bd 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_conn.c +++ b/kernel/net/netfilter/ipvs/ip_vs_conn.c @@ -108,7 +108,7 @@ static inline void ct_write_unlock_bh(unsigned int key) /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto, +static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { @@ -116,11 +116,11 @@ static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int pro if (af == AF_INET6) return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), (__force u32)port, proto, ip_vs_conn_rnd) ^ - ((size_t)net>>8)) & ip_vs_conn_tab_mask; + ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; #endif return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, ip_vs_conn_rnd) ^ - ((size_t)net>>8)) & ip_vs_conn_tab_mask; + ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, @@ -141,14 +141,14 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, port = p->vport; } - return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); } static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, &cp->caddr, cp->cport, NULL, 0, &p); if (cp->pe) { @@ -279,7 +279,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && p->protocol == cp->protocol && - ip_vs_conn_net_eq(cp, p->net)) { + cp->ipvs == p->ipvs) { if (!__ip_vs_conn_get(cp)) continue; /* HIT */ @@ -314,33 +314,34 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) } static int -ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, +ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs, + int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph, - int inverse, struct ip_vs_conn_param *p) + struct ip_vs_conn_param *p) { __be16 _ports[2], *pptr; - struct net *net = skb_net(skb); pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); if (pptr == NULL) return 1; - if (likely(!inverse)) - ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, + if (likely(!ip_vs_iph_inverse(iph))) + ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], p); else - ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, + ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr, pptr[1], &iph->saddr, pptr[0], p); return 0; } struct ip_vs_conn * -ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, int inverse) +ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af, + const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn_param p; - if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) + if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) return NULL; return ip_vs_conn_in_get(&p); @@ -359,7 +360,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (unlikely(p->pe_data && p->pe->ct_match)) { - if (!ip_vs_conn_net_eq(cp, p->net)) + if (cp->ipvs != p->ipvs) continue; if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { if (__ip_vs_conn_get(cp)) @@ -377,7 +378,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) p->vport == cp->vport && p->cport == cp->cport && cp->flags & IP_VS_CONN_F_TEMPLATE && p->protocol == cp->protocol && - ip_vs_conn_net_eq(cp, p->net)) { + cp->ipvs == p->ipvs) { if (__ip_vs_conn_get(cp)) goto out; } @@ -418,7 +419,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && p->protocol == cp->protocol && - ip_vs_conn_net_eq(cp, p->net)) { + cp->ipvs == p->ipvs) { if (!__ip_vs_conn_get(cp)) continue; /* HIT */ @@ -439,12 +440,13 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) } struct ip_vs_conn * -ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, int inverse) +ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, + const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn_param p; - if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p)) + if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) return NULL; return ip_vs_conn_out_get(&p); @@ -638,7 +640,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp) * so we can make the assumption that the svc_af is the same as the * dest_af */ - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr, + dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark, cp->flags); if (dest) { @@ -668,7 +670,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp) #endif ip_vs_bind_xmit(cp); - pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); + pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol); if (pd && atomic_read(&pd->appcnt)) ip_vs_bind_app(cp, pd->pp); } @@ -746,7 +748,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs, int ip_vs_check_template(struct ip_vs_conn *ct) { struct ip_vs_dest *dest = ct->dest; - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); + struct netns_ipvs *ipvs = ct->ipvs; /* * Checking the dest server status. @@ -800,8 +802,7 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) static void ip_vs_conn_expire(unsigned long data) { struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - struct net *net = ip_vs_conn_net(cp); - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs = cp->ipvs; /* * do I control anybody? @@ -847,7 +848,7 @@ static void ip_vs_conn_expire(unsigned long data) cp->timeout = 60*HZ; if (ipvs->sync_state & IP_VS_STATE_MASTER) - ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); + ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); ip_vs_conn_put(cp); } @@ -875,8 +876,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; - struct netns_ipvs *ipvs = net_ipvs(p->net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, + struct netns_ipvs *ipvs = p->ipvs; + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs, p->protocol); cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); @@ -887,7 +888,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, INIT_HLIST_NODE(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - ip_vs_conn_net_set(cp, p->net); + cp->ipvs = ipvs; cp->af = p->af; cp->daf = dest_af; cp->protocol = p->protocol; @@ -1061,7 +1062,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) size_t len = 0; char dbuf[IP_VS_ADDRSTRLEN]; - if (!ip_vs_conn_net_eq(cp, net)) + if (!net_eq(cp->ipvs->net, net)) return 0; if (cp->pe_data) { pe_data[0] = ' '; @@ -1146,7 +1147,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) const struct ip_vs_conn *cp = v; struct net *net = seq_file_net(seq); - if (!ip_vs_conn_net_eq(cp, net)) + if (!net_eq(cp->ipvs->net, net)) return 0; #ifdef CONFIG_IP_VS_IPV6 @@ -1240,7 +1241,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp) } /* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(struct net *net) +void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { int idx; struct ip_vs_conn *cp, *cp_c; @@ -1256,7 +1257,7 @@ void ip_vs_random_dropentry(struct net *net) if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; - if (!ip_vs_conn_net_eq(cp, net)) + if (cp->ipvs != ipvs) continue; if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { @@ -1308,18 +1309,17 @@ void ip_vs_random_dropentry(struct net *net) /* * Flush all the connection entries in the ip_vs_conn_tab */ -static void ip_vs_conn_flush(struct net *net) +static void ip_vs_conn_flush(struct netns_ipvs *ipvs) { int idx; struct ip_vs_conn *cp, *cp_c; - struct netns_ipvs *ipvs = net_ipvs(net); flush_again: rcu_read_lock(); for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (!ip_vs_conn_net_eq(cp, net)) + if (cp->ipvs != ipvs) continue; IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); @@ -1345,23 +1345,22 @@ flush_again: /* * per netns init and exit */ -int __net_init ip_vs_conn_net_init(struct net *net) +int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); - atomic_set(&ipvs->conn_count, 0); - proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops); - proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops); + proc_create("ip_vs_conn", 0, ipvs->net->proc_net, &ip_vs_conn_fops); + proc_create("ip_vs_conn_sync", 0, ipvs->net->proc_net, + &ip_vs_conn_sync_fops); return 0; } -void __net_exit ip_vs_conn_net_cleanup(struct net *net) +void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) { /* flush all the connection entries first */ - ip_vs_conn_flush(net); - remove_proc_entry("ip_vs_conn", net->proc_net); - remove_proc_entry("ip_vs_conn_sync", net->proc_net); + ip_vs_conn_flush(ipvs); + remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); + remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net); } int __init ip_vs_conn_init(void) diff --git a/kernel/net/netfilter/ipvs/ip_vs_core.c b/kernel/net/netfilter/ipvs/ip_vs_core.c index 5d2b806a8..f57b4dcdb 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_core.c +++ b/kernel/net/netfilter/ipvs/ip_vs_core.c @@ -112,7 +112,7 @@ static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; @@ -146,7 +146,7 @@ static inline void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; @@ -179,7 +179,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_cpu_stats *s; s = this_cpu_ptr(cp->dest->stats.cpustats); @@ -215,7 +215,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { - ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, + ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, vport, p); p->pe = rcu_dereference(svc->pe); if (p->pe && p->pe->fill_param) @@ -245,20 +245,30 @@ ip_vs_sched_persist(struct ip_vs_service *svc, const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ + const union nf_inet_addr *src_addr, *dst_addr; + + if (likely(!ip_vs_iph_inverse(iph))) { + src_addr = &iph->saddr; + dst_addr = &iph->daddr; + } else { + src_addr = &iph->daddr; + dst_addr = &iph->saddr; + } + /* Mask saddr with the netmask to adjust template granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, + ipv6_addr_prefix(&snet.in6, &src_addr->in6, (__force __u32) svc->netmask); else #endif - snet.ip = iph->saddr.ip & svc->netmask; + snet.ip = src_addr->ip & svc->netmask; IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port), - IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port), + IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* @@ -276,7 +286,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, */ { int protocol = iph->protocol; - const union nf_inet_addr *vaddr = &iph->daddr; + const union nf_inet_addr *vaddr = dst_addr; __be16 vport = 0; if (dst_port == svc->port) { @@ -319,7 +329,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * return *ignored=0 i.e. ICMP and NF_DROP */ sched = rcu_dereference(svc->scheduler); - dest = sched->schedule(svc, skb, iph); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); @@ -360,8 +376,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, - src_port, &iph->daddr, dst_port, ¶m); + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, + src_port, dst_addr, dst_port, ¶m); cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, skb->mark); @@ -412,7 +428,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_conn *cp = NULL; struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; - __be16 _ports[2], *pptr; + __be16 _ports[2], *pptr, cport, vport; + const void *caddr, *vaddr; unsigned int flags; *ignored = 1; @@ -423,14 +440,26 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, if (pptr == NULL) return NULL; + if (likely(!ip_vs_iph_inverse(iph))) { + cport = pptr[0]; + caddr = &iph->saddr; + vport = pptr[1]; + vaddr = &iph->daddr; + } else { + cport = pptr[1]; + caddr = &iph->daddr; + vport = pptr[0]; + vaddr = &iph->saddr; + } + /* * FTPDATA needs this check when using local real server. * Never schedule Active FTPDATA connections from real server. * For LVS-NAT they must be already created. For other methods * with persistence the connection is created on SYN+ACK. */ - if (pptr[0] == FTPDATA) { - IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + if (cport == FTPDATA) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, "Not scheduling FTPDATA"); return NULL; } @@ -438,19 +467,25 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, /* * Do not schedule replies from local real server. */ - if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - (cp = pp->conn_in_get(svc->af, skb, iph, 1))) { - IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, - "Not scheduling reply for existing connection"); - __ip_vs_conn_put(cp); - return NULL; + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { + iph->hdr_flags ^= IP_VS_HDR_INVERSE; + cp = pp->conn_in_get(svc->ipvs, svc->af, skb, iph); + iph->hdr_flags ^= IP_VS_HDR_INVERSE; + + if (cp) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, + "Not scheduling reply for existing" + " connection"); + __ip_vs_conn_put(cp); + return NULL; + } } /* * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored, + return ip_vs_sched_persist(svc, skb, cport, vport, ignored, iph); *ignored = 0; @@ -458,7 +493,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, /* * Non-persistent service */ - if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->fwmark && vport != svc->port) { if (!svc->port) pr_err("Schedule: port zero only supported " "in persistent services, " @@ -467,7 +502,13 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, } sched = rcu_dereference(svc->scheduler); - dest = sched->schedule(svc, skb, iph); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; @@ -483,11 +524,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, - &iph->saddr, pptr[0], &iph->daddr, - pptr[1], &p); + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + caddr, cport, vaddr, vport, &p); cp = ip_vs_conn_new(&p, dest->af, &dest->addr, - dest->port ? dest->port : pptr[1], + dest->port ? dest->port : vport, flags, dest, skb->mark); if (!cp) { *ignored = -1; @@ -507,6 +547,15 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, return cp; } +static inline int ip_vs_addr_is_unicast(struct net *net, int af, + union nf_inet_addr *addr) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST; +#endif + return (inet_addr_type(net, addr->ip) == RTN_UNICAST); +} /* * Pass or drop the packet. @@ -516,33 +565,21 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) { - __be16 _ports[2], *pptr; -#ifdef CONFIG_SYSCTL - struct net *net; - struct netns_ipvs *ipvs; - int unicast; -#endif + __be16 _ports[2], *pptr, dport; + struct netns_ipvs *ipvs = svc->ipvs; + struct net *net = ipvs->net; pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); - if (pptr == NULL) { + if (!pptr) return NF_DROP; - } - -#ifdef CONFIG_SYSCTL - net = skb_net(skb); - -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST; - else -#endif - unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST); + dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0]; /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ - ipvs = net_ipvs(net); - if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { + if (sysctl_cache_bypass(ipvs) && svc->fwmark && + !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) && + ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) { int ret; struct ip_vs_conn *cp; unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && @@ -554,7 +591,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, @@ -578,7 +615,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_put(cp); return ret; } -#endif /* * When the virtual ftp service is presented, packets destined @@ -586,9 +622,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) + if (svc->port == FTPPORT && dport != FTPPORT) return NF_ACCEPT; + if (unlikely(ip_vs_iph_icmp(iph))) + return NF_DROP; + /* * Notify the client that the destination is unreachable, and * release the socket buffer. @@ -598,11 +637,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { - if (!skb->dev) { - struct net *net_ = dev_net(skb_dst(skb)->dev); - - skb->dev = net_->loopback_dev; - } + if (!skb->dev) + skb->dev = net->loopback_dev; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else #endif @@ -613,15 +649,13 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, #ifdef CONFIG_SYSCTL -static int sysctl_snat_reroute(struct sk_buff *skb) +static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); return ipvs->sysctl_snat_reroute; } -static int sysctl_nat_icmp_send(struct net *net) +static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); return ipvs->sysctl_nat_icmp_send; } @@ -632,8 +666,8 @@ static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) #else -static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } -static int sysctl_nat_icmp_send(struct net *net) { return 0; } +static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } +static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } #endif @@ -652,12 +686,13 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) return IP_DEFRAG_VS_OUT; } -static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, + struct sk_buff *skb, u_int32_t user) { int err; local_bh_disable(); - err = ip_defrag(skb, user); + err = ip_defrag(ipvs->net, skb, user); local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); @@ -665,10 +700,10 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) return err; } -static int ip_vs_route_me_harder(int af, struct sk_buff *skb, - unsigned int hooknum) +static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, + struct sk_buff *skb, unsigned int hooknum) { - if (!sysctl_snat_reroute(skb)) + if (!sysctl_snat_reroute(ipvs)) return 0; /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ if (NF_INET_LOCAL_IN == hooknum) @@ -678,12 +713,12 @@ static int ip_vs_route_me_harder(int af, struct sk_buff *skb, struct dst_entry *dst = skb_dst(skb); if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && - ip6_route_me_harder(skb) != 0) + ip6_route_me_harder(ipvs->net, skb) != 0) return 1; } else #endif if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && - ip_route_me_harder(skb, RTN_LOCAL) != 0) + ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0) return 1; return 0; @@ -836,7 +871,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #endif ip_vs_nat_icmp(skb, pp, cp, 1); - if (ip_vs_route_me_harder(af, skb, hooknum)) + if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto out; /* do the statistics and put it back */ @@ -860,8 +895,8 @@ out: * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. */ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related, - unsigned int hooknum) +static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; @@ -876,7 +911,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { - if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -922,10 +957,10 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking outgoing ICMP for"); - ip_vs_fill_ip4hdr(cih, &ciph); - ciph.len += offset; + ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); + /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, &ciph, 1); + cp = pp->conn_out_get(ipvs, AF_INET, skb, &ciph); if (!cp) return NF_ACCEPT; @@ -935,16 +970,16 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, } #ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, - unsigned int hooknum, struct ip_vs_iphdr *ipvsh) +static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum, + struct ip_vs_iphdr *ipvsh) { struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; union nf_inet_addr snet; - unsigned int writable; + unsigned int offset; *related = 1; ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh); @@ -972,31 +1007,23 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, ic->icmp6_type, ntohs(icmpv6_id(ic)), &ipvsh->saddr, &ipvsh->daddr); - /* Now find the contained IP header */ - ciph.len = ipvsh->len + sizeof(_icmph); - ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); - if (ip6h == NULL) + if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), + true, &ciph)) return NF_ACCEPT; /* The packet looks wrong, ignore */ - ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */ - ciph.daddr.in6 = ip6h->daddr; - /* skip possible IPv6 exthdrs of contained IPv6 packet */ - ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); - if (ciph.protocol < 0) - return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ pp = ip_vs_proto_get(ciph.protocol); if (!pp) return NF_ACCEPT; /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1); + cp = pp->conn_out_get(ipvs, AF_INET6, skb, &ciph); if (!cp) return NF_ACCEPT; snet.in6 = ciph.saddr.in6; - writable = ciph.len; + offset = ciph.len; return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, - pp, writable, sizeof(struct ipv6hdr), + pp, offset, sizeof(struct ipv6hdr), hooknum); } #endif @@ -1081,7 +1108,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, { struct ip_vs_protocol *pp = pd->pp; - IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); + IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); if (!skb_make_writable(skb, iph->len)) goto drop; @@ -1115,10 +1142,10 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * if it came from this machine itself. So re-compute * the routing information. */ - if (ip_vs_route_me_harder(af, skb, hooknum)) + if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto drop; - IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); + IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); @@ -1143,13 +1170,13 @@ drop: * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) { - struct net *net = NULL; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; + struct sock *sk; EnterFunction(11); @@ -1157,29 +1184,27 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (skb->ipvs_property) return NF_ACCEPT; + sk = skb_to_full_sk(skb); /* Bad... Do not break raw sockets */ - if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(skb->sk); - if (inet && sk->sk_family == PF_INET && inet->nodefrag) + if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) return NF_ACCEPT; } if (unlikely(!skb_dst(skb))) return NF_ACCEPT; - net = skb_net(skb); - if (!net_ipvs(net)->enable) + if (!ipvs->enable) return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, &iph); + ip_vs_fill_iph_skb(af, skb, false, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; - int verdict = ip_vs_out_icmp_v6(skb, &related, + int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, hooknum, &iph); if (related) @@ -1189,13 +1214,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related; - int verdict = ip_vs_out_icmp(skb, &related, hooknum); + int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum); if (related) return verdict; } - pd = ip_vs_proto_data_get(net, iph.protocol); + pd = ip_vs_proto_data_get(ipvs, iph.protocol); if (unlikely(!pd)) return NF_ACCEPT; pp = pd->pp; @@ -1205,21 +1230,21 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (af == AF_INET) #endif if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, + if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; - ip_vs_fill_ip4hdr(skb_network_header(skb), &iph); + ip_vs_fill_iph_skb(AF_INET, skb, false, &iph); } /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(af, skb, &iph, 0); + cp = pp->conn_out_get(ipvs, af, skb, &iph); if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); - if (sysctl_nat_icmp_send(net) && + if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) { @@ -1229,7 +1254,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) sizeof(_ports), _ports, &iph); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr, + if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, pptr[0])) { /* * Notify the real server: there is no @@ -1246,7 +1271,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (!skb->dev) - skb->dev = net->loopback_dev; + skb->dev = ipvs->net->loopback_dev; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, @@ -1260,7 +1285,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) } } } - IP_VS_DBG_PKT(12, af, pp, skb, 0, + IP_VS_DBG_PKT(12, af, pp, skb, iph.off, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; } @@ -1271,10 +1296,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_reply4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); } /* @@ -1282,10 +1307,10 @@ ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_reply4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1296,10 +1321,10 @@ ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_reply6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET6); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); } /* @@ -1307,14 +1332,51 @@ ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, * Check if packet is reply for established ip_vs_conn. */ static unsigned int -ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_reply6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_out(ops->hooknum, skb, AF_INET6); + return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); } #endif +static unsigned int +ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_protocol *pp = pd->pp; + + if (!iph->fragoffs) { + /* No (second) fragments need to enter here, as nf_defrag_ipv6 + * replayed fragment zero will already have created the cp + */ + + /* Schedule and create new connection entry into cpp */ + if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph)) + return 0; + } + + if (unlikely(!*cpp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, af, pp, skb, iph->off, + "ip_vs_in: packet continues traversal as normal"); + if (iph->fragoffs) { + /* Fragment that couldn't be mapped to a conn entry + * is missing module nf_defrag_ipv6 + */ + IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); + IP_VS_DBG_PKT(7, af, pp, skb, iph->off, + "unhandled fragment"); + } + *verdict = NF_ACCEPT; + return 0; + } + + return 1; +} + /* * Handle ICMP messages in the outside-to-inside direction (incoming). * Find any that might be relevant, check against existing connections, @@ -1322,9 +1384,9 @@ ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, * Currently handles error types - unreachable, quench, ttl exceeded. */ static int -ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) +ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, + unsigned int hooknum) { - struct net *net = NULL; struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ @@ -1333,13 +1395,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; - bool ipip; + bool ipip, new_cp = false; *related = 1; /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { - if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1373,8 +1435,6 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - net = skb_net(skb); - /* Special case for errors for IPIP packets */ ipip = false; if (cih->protocol == IPPROTO_IPIP) { @@ -1390,7 +1450,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ipip = true; } - pd = ip_vs_proto_data_get(net, cih->protocol); + pd = ip_vs_proto_data_get(ipvs, cih->protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; @@ -1404,15 +1464,24 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) "Checking incoming ICMP for"); offset2 = offset; - ip_vs_fill_ip4hdr(cih, &ciph); - ciph.len += offset; + ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph); offset = ciph.len; + /* The embedded headers contain source and dest in reverse order. * For IPIP this is error for request, not for reply. */ - cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1); - if (!cp) - return NF_ACCEPT; + cp = pp->conn_in_get(ipvs, AF_INET, skb, &ciph); + + if (!cp) { + int v; + + if (!sysctl_schedule_icmp(ipvs)) + return NF_ACCEPT; + + if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) + return v; + new_cp = true; + } verdict = NF_DROP; @@ -1443,7 +1512,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); - ipv4_update_pmtu(skb, dev_net(skb->dev), + ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0, 0, 0); /* Client uses PMTUD? */ if (!(frag_off & htons(IP_DF))) @@ -1489,23 +1558,26 @@ ignore_ipip: verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); out: - __ip_vs_conn_put(cp); + if (likely(!new_cp)) + __ip_vs_conn_put(cp); + else + ip_vs_conn_put(cp); return verdict; } #ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, - unsigned int hooknum, struct ip_vs_iphdr *iph) +static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum, + struct ip_vs_iphdr *iph) { - struct net *net = NULL; - struct ipv6hdr _ip6h, *ip6h; struct icmp6hdr _icmph, *ic; struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; - unsigned int offs_ciph, writable, verdict; + unsigned int offset, verdict; + bool new_cp = false; *related = 1; @@ -1534,21 +1606,11 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, ic->icmp6_type, ntohs(icmpv6_id(ic)), &iph->saddr, &iph->daddr); - /* Now find the contained IP header */ - ciph.len = iph->len + sizeof(_icmph); - offs_ciph = ciph.len; /* Save ip header offset */ - ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h); - if (ip6h == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */ - ciph.daddr.in6 = ip6h->daddr; - /* skip possible IPv6 exthdrs of contained IPv6 packet */ - ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL); - if (ciph.protocol < 0) - return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */ - - net = skb_net(skb); - pd = ip_vs_proto_data_get(net, ciph.protocol); + offset = iph->len + sizeof(_icmph); + if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph)) + return NF_ACCEPT; + + pd = ip_vs_proto_data_get(ipvs, ciph.protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; @@ -1557,36 +1619,49 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, if (ciph.fragoffs) return NF_ACCEPT; - IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph, + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, "Checking incoming ICMPv6 for"); /* The embedded headers contain source and dest in reverse order * if not from localhost */ - cp = pp->conn_in_get(AF_INET6, skb, &ciph, - (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1); + cp = pp->conn_in_get(ipvs, AF_INET6, skb, &ciph); + + if (!cp) { + int v; + + if (!sysctl_schedule_icmp(ipvs)) + return NF_ACCEPT; + + if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph)) + return v; + + new_cp = true; + } - if (!cp) - return NF_ACCEPT; /* VS/TUN, VS/DR and LOCALNODE just let it go */ if ((hooknum == NF_INET_LOCAL_OUT) && (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { - __ip_vs_conn_put(cp); - return NF_ACCEPT; + verdict = NF_ACCEPT; + goto out; } /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); /* Need to mangle contained IPv6 header in ICMPv6 packet */ - writable = ciph.len; + offset = ciph.len; if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || IPPROTO_SCTP == ciph.protocol) - writable += 2 * sizeof(__u16); /* Also mangle ports */ + offset += 2 * sizeof(__u16); /* Also mangle ports */ - verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph); + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph); - __ip_vs_conn_put(cp); +out: + if (likely(!new_cp)) + __ip_vs_conn_put(cp); + else + ip_vs_conn_put(cp); return verdict; } @@ -1598,16 +1673,15 @@ static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, * and send it on its way... */ static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) { - struct net *net; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int ret, pkts; - struct netns_ipvs *ipvs; int conn_reuse_mode; + struct sock *sk; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) @@ -1621,7 +1695,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely((skb->pkt_type != PACKET_HOST && hooknum != NF_INET_LOCAL_OUT) || !skb_dst(skb))) { - ip_vs_fill_iph_skb(af, skb, &iph); + ip_vs_fill_iph_skb(af, skb, false, &iph); IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" " ignored in hook %u\n", skb->pkt_type, iph.protocol, @@ -1629,20 +1703,17 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) return NF_ACCEPT; } /* ipvs enabled in this netns ? */ - net = skb_net(skb); - ipvs = net_ipvs(net); if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, &iph); + ip_vs_fill_iph_skb(af, skb, false, &iph); /* Bad... Do not break raw sockets */ - if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + sk = skb_to_full_sk(skb); + if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(skb->sk); - if (inet && sk->sk_family == PF_INET && inet->nodefrag) + if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) return NF_ACCEPT; } @@ -1650,8 +1721,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; - int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, - &iph); + int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related, + hooknum, &iph); if (related) return verdict; @@ -1660,21 +1731,30 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related; - int verdict = ip_vs_in_icmp(skb, &related, hooknum); + int verdict = ip_vs_in_icmp(ipvs, skb, &related, + hooknum); if (related) return verdict; } /* Protocol supported? */ - pd = ip_vs_proto_data_get(net, iph.protocol); - if (unlikely(!pd)) + pd = ip_vs_proto_data_get(ipvs, iph.protocol); + if (unlikely(!pd)) { + /* The only way we'll see this packet again is if it's + * encapsulated, so mark it with ipvs_property=1 so we + * skip it if we're ignoring tunneled packets + */ + if (sysctl_ignore_tunneled(ipvs)) + skb->ipvs_property = 1; + return NF_ACCEPT; + } pp = pd->pp; /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(af, skb, &iph, 0); + cp = pp->conn_in_get(ipvs, af, skb, &iph); conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); if (conn_reuse_mode && !iph.fragoffs && @@ -1688,32 +1768,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) cp = NULL; } - if (unlikely(!cp) && !iph.fragoffs) { - /* No (second) fragments need to enter here, as nf_defrag_ipv6 - * replayed fragment zero will already have created the cp - */ + if (unlikely(!cp)) { int v; - /* Schedule and create new connection entry into &cp */ - if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph)) + if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) return v; } - if (unlikely(!cp)) { - /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, af, pp, skb, 0, - "ip_vs_in: packet continues traversal as normal"); - if (iph.fragoffs) { - /* Fragment that couldn't be mapped to a conn entry - * is missing module nf_defrag_ipv6 - */ - IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); - IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment"); - } - return NF_ACCEPT; - } + IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); - IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ @@ -1753,7 +1816,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) pkts = atomic_add_return(1, &cp->in_pkts); if (ipvs->sync_state & IP_VS_STATE_MASTER) - ip_vs_sync_conn(net, cp, pkts); + ip_vs_sync_conn(ipvs, cp, pkts); ip_vs_conn_put(cp); return ret; @@ -1764,10 +1827,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) * Schedule and forward packets from remote clients */ static unsigned int -ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_remote_request4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); } /* @@ -1775,10 +1838,10 @@ ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Schedule and forward packets from local clients */ static unsigned int -ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_request4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); } #ifdef CONFIG_IP_VS_IPV6 @@ -1788,10 +1851,10 @@ ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, * Schedule and forward packets from remote clients */ static unsigned int -ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_remote_request6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET6); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); } /* @@ -1799,10 +1862,10 @@ ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, * Schedule and forward packets from local clients */ static unsigned int -ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_local_request6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip_vs_in(ops->hooknum, skb, AF_INET6); + return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); } #endif @@ -1818,46 +1881,40 @@ ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, * and send them to ip_vs_in_icmp. */ static unsigned int -ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_forward_icmp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int r; - struct net *net; - struct netns_ipvs *ipvs; + struct netns_ipvs *ipvs = net_ipvs(state->net); if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; /* ipvs enabled in this netns ? */ - net = skb_net(skb); - ipvs = net_ipvs(net); if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp(skb, &r, ops->hooknum); + return ip_vs_in_icmp(ipvs, skb, &r, state->hook); } #ifdef CONFIG_IP_VS_IPV6 static unsigned int -ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb, +ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int r; - struct net *net; - struct netns_ipvs *ipvs; + struct netns_ipvs *ipvs = net_ipvs(state->net); struct ip_vs_iphdr iphdr; - ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); + ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); if (iphdr.protocol != IPPROTO_ICMPV6) return NF_ACCEPT; /* ipvs enabled in this netns ? */ - net = skb_net(skb); - ipvs = net_ipvs(net); if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr); + return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); } #endif @@ -1866,7 +1923,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply4, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 2, @@ -1876,7 +1932,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { * applied to IPVS. */ { .hook = ip_vs_remote_request4, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 1, @@ -1884,7 +1939,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_local_reply4, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 1, @@ -1892,7 +1946,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After mangle, schedule and forward local requests */ { .hook = ip_vs_local_request4, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 2, @@ -1901,7 +1954,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 99, @@ -1909,7 +1961,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply4, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, @@ -1918,7 +1969,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 2, @@ -1928,7 +1978,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { * applied to IPVS. */ { .hook = ip_vs_remote_request6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 1, @@ -1936,7 +1985,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_local_reply6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, @@ -1944,7 +1992,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After mangle, schedule and forward local requests */ { .hook = ip_vs_local_request6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 2, @@ -1953,7 +2000,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp_v6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 99, @@ -1961,7 +2007,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply6, - .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, @@ -1987,22 +2032,22 @@ static int __net_init __ip_vs_init(struct net *net) atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; - if (ip_vs_estimator_net_init(net) < 0) + if (ip_vs_estimator_net_init(ipvs) < 0) goto estimator_fail; - if (ip_vs_control_net_init(net) < 0) + if (ip_vs_control_net_init(ipvs) < 0) goto control_fail; - if (ip_vs_protocol_net_init(net) < 0) + if (ip_vs_protocol_net_init(ipvs) < 0) goto protocol_fail; - if (ip_vs_app_net_init(net) < 0) + if (ip_vs_app_net_init(ipvs) < 0) goto app_fail; - if (ip_vs_conn_net_init(net) < 0) + if (ip_vs_conn_net_init(ipvs) < 0) goto conn_fail; - if (ip_vs_sync_net_init(net) < 0) + if (ip_vs_sync_net_init(ipvs) < 0) goto sync_fail; printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", @@ -2013,15 +2058,15 @@ static int __net_init __ip_vs_init(struct net *net) */ sync_fail: - ip_vs_conn_net_cleanup(net); + ip_vs_conn_net_cleanup(ipvs); conn_fail: - ip_vs_app_net_cleanup(net); + ip_vs_app_net_cleanup(ipvs); app_fail: - ip_vs_protocol_net_cleanup(net); + ip_vs_protocol_net_cleanup(ipvs); protocol_fail: - ip_vs_control_net_cleanup(net); + ip_vs_control_net_cleanup(ipvs); control_fail: - ip_vs_estimator_net_cleanup(net); + ip_vs_estimator_net_cleanup(ipvs); estimator_fail: net->ipvs = NULL; return -ENOMEM; @@ -2029,22 +2074,25 @@ estimator_fail: static void __net_exit __ip_vs_cleanup(struct net *net) { - ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */ - ip_vs_conn_net_cleanup(net); - ip_vs_app_net_cleanup(net); - ip_vs_protocol_net_cleanup(net); - ip_vs_control_net_cleanup(net); - ip_vs_estimator_net_cleanup(net); - IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ + ip_vs_conn_net_cleanup(ipvs); + ip_vs_app_net_cleanup(ipvs); + ip_vs_protocol_net_cleanup(ipvs); + ip_vs_control_net_cleanup(ipvs); + ip_vs_estimator_net_cleanup(ipvs); + IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); net->ipvs = NULL; } static void __net_exit __ip_vs_dev_cleanup(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); EnterFunction(2); - net_ipvs(net)->enable = 0; /* Disable packet reception */ + ipvs->enable = 0; /* Disable packet reception */ smp_wmb(); - ip_vs_sync_net_cleanup(net); + ip_vs_sync_net_cleanup(ipvs); LeaveFunction(2); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_ctl.c b/kernel/net/netfilter/ipvs/ip_vs_ctl.c index 285eae3a1..e7c1b052c 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_ctl.c +++ b/kernel/net/netfilter/ipvs/ip_vs_ctl.c @@ -228,7 +228,7 @@ static void defense_work_handler(struct work_struct *work) update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) - ip_vs_random_dropentry(ipvs->net); + ip_vs_random_dropentry(ipvs); schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); } #endif @@ -263,7 +263,7 @@ static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; * Returns hash value for virtual service */ static inline unsigned int -ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, +ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { register unsigned int porth = ntohs(port); @@ -276,7 +276,7 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, addr->ip6[2]^addr->ip6[3]; #endif ahash = ntohl(addr_fold); - ahash ^= ((size_t) net >> 8); + ahash ^= ((size_t) ipvs >> 8); return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & IP_VS_SVC_TAB_MASK; @@ -285,9 +285,9 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, /* * Returns hash value of fwmark for virtual service lookup */ -static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) +static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) { - return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; + return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } /* @@ -309,14 +309,14 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* * Hash it by in ip_vs_svc_table */ - hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, + hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ - hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); + hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } @@ -357,21 +357,21 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) * Get service by {netns, proto,addr,port} in the service table. */ static inline struct ip_vs_service * -__ip_vs_service_find(struct net *net, int af, __u16 protocol, +__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { unsigned int hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); + hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) && (svc->protocol == protocol) - && net_eq(svc->net, net)) { + && (svc->ipvs == ipvs)) { /* HIT */ return svc; } @@ -385,17 +385,17 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol, * Get service by {fwmark} in the service table. */ static inline struct ip_vs_service * -__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) +__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(net, fwmark); + hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af - && net_eq(svc->net, net)) { + && (svc->ipvs == ipvs)) { /* HIT */ return svc; } @@ -406,17 +406,16 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) /* Find service, called under RCU lock */ struct ip_vs_service * -ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, +ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; - struct netns_ipvs *ipvs = net_ipvs(net); /* * Check the table hashed by fwmark first */ if (fwmark) { - svc = __ip_vs_svc_fwm_find(net, af, fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark); if (svc) goto out; } @@ -425,7 +424,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, * Check the table hashed by * for "full" addressed entries */ - svc = __ip_vs_service_find(net, af, protocol, vaddr, vport); + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP @@ -435,7 +434,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ - svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT); + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); } if (svc == NULL @@ -443,7 +442,7 @@ ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol, /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_find(net, af, protocol, vaddr, 0); + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0); } out: @@ -543,10 +542,9 @@ static void ip_vs_rs_unhash(struct ip_vs_dest *dest) } /* Check if real service by is present */ -bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, +bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { - struct netns_ipvs *ipvs = net_ipvs(net); unsigned int hash; struct ip_vs_dest *dest; @@ -601,7 +599,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, * on the backup. * Called under RCU lock, no refcnt is returned. */ -struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af, +struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, @@ -612,7 +610,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af, struct ip_vs_service *svc; __be16 port = dport; - svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) @@ -660,7 +658,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; /* * Find the destination in trash @@ -715,10 +713,9 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest) * are expired, and the refcnt of each destination in the trash must * be 0, so we simply release them here. */ -static void ip_vs_trash_cleanup(struct net *net) +static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; - struct netns_ipvs *ipvs = net_ipvs(net); del_timer_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ @@ -788,7 +785,7 @@ static void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_service *old_svc; struct ip_vs_scheduler *sched; int conn_flags; @@ -842,15 +839,16 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); - sched = rcu_dereference_protected(svc->scheduler, 1); if (add) { - ip_vs_start_estimator(svc->net, &dest->stats); + ip_vs_start_estimator(svc->ipvs, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; - if (sched->add_dest) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->add_dest) sched->add_dest(svc, dest); } else { - if (sched->upd_dest) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->upd_dest) sched->upd_dest(svc, dest); } } @@ -873,12 +871,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && - !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6)) + !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) return -EINVAL; } else #endif { - atype = inet_addr_type(svc->net, udest->addr.ip); + atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; } @@ -1035,12 +1033,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete a destination (must be already unlinked from the service) */ -static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, +static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, bool cleanup) { - struct netns_ipvs *ipvs = net_ipvs(net); - - ip_vs_stop_estimator(net, &dest->stats); + ip_vs_stop_estimator(ipvs, &dest->stats); /* * Remove it from the d-linked list with the real services. @@ -1078,13 +1074,13 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, svc->num_dests--; if (dest->af != svc->af) - net_ipvs(svc->net)->mixed_address_family_dests--; + svc->ipvs->mixed_address_family_dests--; if (svcupd) { struct ip_vs_scheduler *sched; sched = rcu_dereference_protected(svc->scheduler, 1); - if (sched->del_dest) + if (sched && sched->del_dest) sched->del_dest(svc, dest); } } @@ -1119,7 +1115,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete the destination */ - __ip_vs_del_dest(svc->net, dest, false); + __ip_vs_del_dest(svc->ipvs, dest, false); LeaveFunction(2); @@ -1128,8 +1124,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) static void ip_vs_dest_trash_expire(unsigned long data) { - struct net *net = (struct net *) data; - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs = (struct netns_ipvs *)data; struct ip_vs_dest *dest, *next; unsigned long now = jiffies; @@ -1162,24 +1157,26 @@ static void ip_vs_dest_trash_expire(unsigned long data) * Add a service into the service hash table */ static int -ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, +ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { int ret = 0, i; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; - struct netns_ipvs *ipvs = net_ipvs(net); /* increase the module use count */ ip_vs_use_count_inc(); /* Lookup the scheduler by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); - ret = -ENOENT; - goto out_err; + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_err; + } } if (u->pe_name && *u->pe_name) { @@ -1233,17 +1230,19 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, svc->flags = u->flags; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; - svc->net = net; + svc->ipvs = ipvs; INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + } /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); @@ -1255,7 +1254,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); - ip_vs_start_estimator(net, &svc->stats); + ip_vs_start_estimator(ipvs, &svc->stats); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) @@ -1291,17 +1290,20 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { - struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; /* * Lookup the scheduler, by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); - return -ENOENT; + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } } old_sched = sched; @@ -1329,14 +1331,20 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) old_sched = rcu_dereference_protected(svc->scheduler, 1); if (sched != old_sched) { + if (old_sched) { + ip_vs_unbind_scheduler(svc, old_sched); + RCU_INIT_POINTER(svc->scheduler, NULL); + /* Wait all svc->sched_data users */ + synchronize_rcu(); + } /* Bind the new scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) { - old_sched = sched; - goto out; + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + ip_vs_scheduler_put(sched); + goto out; + } } - /* Unbind the old scheduler on success */ - ip_vs_unbind_scheduler(svc, old_sched); } /* @@ -1366,7 +1374,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; struct ip_vs_pe *old_pe; - struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct netns_ipvs *ipvs = svc->ipvs; pr_info("%s: enter\n", __func__); @@ -1374,7 +1382,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) if (svc->af == AF_INET) ipvs->num_services--; - ip_vs_stop_estimator(svc->net, &svc->stats); + ip_vs_stop_estimator(svc->ipvs, &svc->stats); /* Unbind scheduler */ old_sched = rcu_dereference_protected(svc->scheduler, 1); @@ -1390,7 +1398,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(svc->net, dest, cleanup); + __ip_vs_del_dest(svc->ipvs, dest, cleanup); } /* @@ -1441,7 +1449,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Flush all the virtual services */ -static int ip_vs_flush(struct net *net, bool cleanup) +static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { int idx; struct ip_vs_service *svc; @@ -1453,7 +1461,7 @@ static int ip_vs_flush(struct net *net, bool cleanup) for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } @@ -1464,7 +1472,7 @@ static int ip_vs_flush(struct net *net, bool cleanup) for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } @@ -1476,12 +1484,12 @@ static int ip_vs_flush(struct net *net, bool cleanup) * Delete service by {netns} in the service table. * Called by __ip_vs_cleanup() */ -void ip_vs_service_net_cleanup(struct net *net) +void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs) { EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(net, true); + ip_vs_flush(ipvs, true); mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } @@ -1525,7 +1533,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net)) { + if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); @@ -1534,7 +1542,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, } hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net)) { + if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); @@ -1568,26 +1576,26 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) return 0; } -static int ip_vs_zero_all(struct net *net) +static int ip_vs_zero_all(struct netns_ipvs *ipvs) { int idx; struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net)) + if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } - ip_vs_zero_stats(&net_ipvs(net)->tot_stats); + ip_vs_zero_stats(&ipvs->tot_stats); return 0; } @@ -1600,7 +1608,7 @@ static int proc_do_defense_mode(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int rc; @@ -1611,7 +1619,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, /* Restore the correct value */ *valp = val; } else { - update_defense_level(net_ipvs(net)); + update_defense_level(ipvs); } } return rc; @@ -1829,6 +1837,18 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "schedule_icmp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ignore_tunneled", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -1874,6 +1894,7 @@ static inline const char *ip_vs_fwd_name(unsigned int flags) static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_iter *iter = seq->private; int idx; struct ip_vs_service *svc; @@ -1881,7 +1902,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { - if (net_eq(svc->net, net) && pos-- == 0) { + if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; return svc; @@ -1893,7 +1914,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (net_eq(svc->net, net) && pos-- == 0) { + if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; return svc; @@ -1982,6 +2003,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? sched->name : "none"; if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 @@ -1990,18 +2012,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), - sched->name); + sched_name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - sched->name, + sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", - svc->fwmark, sched->name, + svc->fwmark, sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } @@ -2180,7 +2202,7 @@ static const struct file_operations ip_vs_stats_percpu_fops = { /* * Set timeout values for tcp tcpfin udp in the timeout_table. */ -static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) +static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; @@ -2193,13 +2215,13 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { - pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = u->tcp_timeout * HZ; } if (u->tcp_fin_timeout) { - pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] = u->tcp_fin_timeout * HZ; } @@ -2207,7 +2229,7 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout) { - pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); pd->timeout_table[IP_VS_UDP_S_NORMAL] = u->udp_timeout * HZ; } @@ -2319,24 +2341,34 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - mutex_lock(&ipvs->sync_mutex); - if (cmd == IP_VS_SO_SET_STARTDAEMON) - ret = start_sync_thread(net, dm->state, dm->mcast_ifn, - dm->syncid); - else - ret = stop_sync_thread(net, dm->state); - mutex_unlock(&ipvs->sync_mutex); + if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ipvs_sync_daemon_cfg cfg; + + memset(&cfg, 0, sizeof(cfg)); + strlcpy(cfg.mcast_ifn, dm->mcast_ifn, + sizeof(cfg.mcast_ifn)); + cfg.syncid = dm->syncid; + rtnl_lock(); + mutex_lock(&ipvs->sync_mutex); + ret = start_sync_thread(ipvs, &cfg, dm->state); + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + } else { + mutex_lock(&ipvs->sync_mutex); + ret = stop_sync_thread(ipvs, dm->state); + mutex_unlock(&ipvs->sync_mutex); + } goto out_dec; } mutex_lock(&__ip_vs_mutex); if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ - ret = ip_vs_flush(net, false); + ret = ip_vs_flush(ipvs, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg); + ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); goto out_unlock; } @@ -2351,7 +2383,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { - ret = ip_vs_zero_all(net); + ret = ip_vs_zero_all(ipvs); goto out_unlock; } } @@ -2369,10 +2401,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* Lookup the exact service by or fwmark */ rcu_read_lock(); if (usvc.fwmark == 0) - svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, + svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD @@ -2386,7 +2418,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (svc != NULL) ret = -EEXIST; else - ret = ip_vs_add_service(net, &usvc, &svc); + ret = ip_vs_add_service(ipvs, &usvc, &svc); break; case IP_VS_SO_SET_EDIT: ret = ip_vs_edit_service(svc, &usvc); @@ -2427,13 +2459,15 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { struct ip_vs_scheduler *sched; struct ip_vs_kstats kstats; + char *sched_name; sched = rcu_dereference_protected(src->scheduler, 1); + sched_name = sched ? sched->name : "none"; dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); + strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2443,7 +2477,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) } static inline int -__ip_vs_get_service_entries(struct net *net, +__ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { @@ -2455,7 +2489,7 @@ __ip_vs_get_service_entries(struct net *net, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET || !net_eq(svc->net, net)) + if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) @@ -2474,7 +2508,7 @@ __ip_vs_get_service_entries(struct net *net, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET || !net_eq(svc->net, net)) + if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) @@ -2494,7 +2528,7 @@ out: } static inline int -__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, +__ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; @@ -2503,9 +2537,9 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, rcu_read_lock(); if (get->fwmark) - svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); else - svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, + svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, get->port); rcu_read_unlock(); @@ -2550,7 +2584,7 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, } static inline void -__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) +__ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; @@ -2559,12 +2593,12 @@ __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) memset(u, 0, sizeof (*u)); #ifdef CONFIG_IP_VS_PROTO_TCP - pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; #endif #ifdef CONFIG_IP_VS_PROTO_UDP - pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); u->udp_timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; #endif @@ -2627,15 +2661,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ipvs->master_syncid; + d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ipvs->backup_syncid; + d[1].syncid = ipvs->bcfg.syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; @@ -2683,7 +2717,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EINVAL; goto out; } - ret = __ip_vs_get_service_entries(net, get, user); + ret = __ip_vs_get_service_entries(ipvs, get, user); } break; @@ -2697,9 +2731,9 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) addr.ip = entry->addr; rcu_read_lock(); if (entry->fwmark) - svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); else - svc = __ip_vs_service_find(net, AF_INET, + svc = __ip_vs_service_find(ipvs, AF_INET, entry->protocol, &addr, entry->port); rcu_read_unlock(); @@ -2725,7 +2759,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EINVAL; goto out; } - ret = __ip_vs_get_dest_entries(net, get, user); + ret = __ip_vs_get_dest_entries(ipvs, get, user); } break; @@ -2733,7 +2767,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(net, &t); + __ip_vs_get_timeouts(ipvs, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; } @@ -2790,6 +2824,11 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, .len = IP_VS_IFNAME_MAXLEN }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, + [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ @@ -2892,6 +2931,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; struct ip_vs_kstats kstats; + char *sched_name; nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) @@ -2910,8 +2950,9 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, } sched = rcu_dereference_protected(svc->scheduler, 1); + sched_name = sched ? sched->name : "none"; pe = rcu_dereference_protected(svc->pe, 1); - if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || @@ -2961,12 +3002,13 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, int idx = 0, i; int start = cb->args[0]; struct ip_vs_service *svc; - struct net *net = skb_sknet(skb); + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start || !net_eq(svc->net, net)) + if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2977,7 +3019,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start || !net_eq(svc->net, net)) + if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2993,7 +3035,7 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_parse_service(struct net *net, +static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry, struct ip_vs_service **ret_svc) @@ -3038,9 +3080,9 @@ static int ip_vs_genl_parse_service(struct net *net, rcu_read_lock(); if (usvc->fwmark) - svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); + svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); else - svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, + svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, &usvc->addr, usvc->port); rcu_read_unlock(); *ret_svc = svc; @@ -3078,14 +3120,14 @@ static int ip_vs_genl_parse_service(struct net *net, return 0; } -static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, +static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, struct nlattr *nla) { struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; int ret; - ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc); + ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, 0, &svc); return ret ? ERR_PTR(ret) : svc; } @@ -3160,7 +3202,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; - struct net *net = skb_sknet(skb); + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); @@ -3170,7 +3213,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, goto out_err; - svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]); + svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc) || svc == NULL) goto out_err; @@ -3246,7 +3289,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, } static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, - const char *mcast_ifn, __u32 syncid) + struct ipvs_sync_daemon_cfg *c) { struct nlattr *nl_daemon; @@ -3255,9 +3298,23 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, return -EMSGSIZE; if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || - nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || - nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || + nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) goto nla_put_failure; +#ifdef CONFIG_IP_VS_IPV6 + if (c->mcast_af == AF_INET6) { + if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, + &c->mcast_group.in6)) + goto nla_put_failure; + } else +#endif + if (c->mcast_af == AF_INET && + nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, + c->mcast_group.ip)) + goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; @@ -3268,7 +3325,7 @@ nla_put_failure: } static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, - const char *mcast_ifn, __u32 syncid, + struct ipvs_sync_daemon_cfg *c, struct netlink_callback *cb) { void *hdr; @@ -3278,7 +3335,7 @@ static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, if (!hdr) return -EMSGSIZE; - if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + if (ip_vs_genl_fill_daemon(skb, state, c)) goto nla_put_failure; genlmsg_end(skb, hdr); @@ -3292,14 +3349,13 @@ nla_put_failure: static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb_sknet(skb); + struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&ipvs->sync_mutex); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ipvs->master_mcast_ifn, - ipvs->master_syncid, cb) < 0) + &ipvs->mcfg, cb) < 0) goto nla_put_failure; cb->args[0] = 1; @@ -3307,8 +3363,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb, if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ipvs->backup_mcast_ifn, - ipvs->backup_syncid, cb) < 0) + &ipvs->bcfg, cb) < 0) goto nla_put_failure; cb->args[1] = 1; @@ -3320,39 +3375,90 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) +static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { + struct ipvs_sync_daemon_cfg c; + struct nlattr *a; + int ret; + + memset(&c, 0, sizeof(c)); if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; + strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + sizeof(c.mcast_ifn)); + c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); + + a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; + if (a) + c.sync_maxlen = nla_get_u16(a); + + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; + if (a) { + c.mcast_af = AF_INET; + c.mcast_group.ip = nla_get_in_addr(a); + if (!ipv4_is_multicast(c.mcast_group.ip)) + return -EINVAL; + } else { + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; + if (a) { +#ifdef CONFIG_IP_VS_IPV6 + int addr_type; + + c.mcast_af = AF_INET6; + c.mcast_group.in6 = nla_get_in6_addr(a); + addr_type = ipv6_addr_type(&c.mcast_group.in6); + if (!(addr_type & IPV6_ADDR_MULTICAST)) + return -EINVAL; +#else + return -EAFNOSUPPORT; +#endif + } + } + + a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; + if (a) + c.mcast_port = nla_get_u16(a); + + a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; + if (a) + c.mcast_ttl = nla_get_u8(a); /* The synchronization protocol is incompatible with mixed family * services */ - if (net_ipvs(net)->mixed_address_family_dests > 0) + if (ipvs->mixed_address_family_dests > 0) return -EINVAL; - return start_sync_thread(net, - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), - nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), - nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); + rtnl_lock(); + mutex_lock(&ipvs->sync_mutex); + ret = start_sync_thread(ipvs, &c, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + return ret; } -static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) +static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { + int ret; + if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; - return stop_sync_thread(net, - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + mutex_lock(&ipvs->sync_mutex); + ret = stop_sync_thread(ipvs, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + mutex_unlock(&ipvs->sync_mutex); + return ret; } -static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) +static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(net, &t); + __ip_vs_get_timeouts(ipvs, &t); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); @@ -3364,38 +3470,33 @@ static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); - return ip_vs_set_timeout(net, &t); + return ip_vs_set_timeout(ipvs, &t); } static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) { - int ret = 0, cmd; - struct net *net; - struct netns_ipvs *ipvs; + int ret = -EINVAL, cmd; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); - net = skb_sknet(skb); - ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; - mutex_lock(&ipvs->sync_mutex); if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], - ip_vs_daemon_policy)) { - ret = -EINVAL; + ip_vs_daemon_policy)) goto out; - } if (cmd == IPVS_CMD_NEW_DAEMON) - ret = ip_vs_genl_new_daemon(net, daemon_attrs); + ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs); else - ret = ip_vs_genl_del_daemon(net, daemon_attrs); -out: - mutex_unlock(&ipvs->sync_mutex); + ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs); } + +out: return ret; } @@ -3406,22 +3507,22 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) struct ip_vs_dest_user_kern udest; int ret = 0, cmd; int need_full_svc = 0, need_full_dest = 0; - struct net *net; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); - net = skb_sknet(skb); cmd = info->genlhdr->cmd; mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(net, false); + ret = ip_vs_flush(ipvs, false); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { - ret = ip_vs_genl_set_config(net, info->attrs); + ret = ip_vs_genl_set_config(ipvs, info->attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { - ret = ip_vs_zero_all(net); + ret = ip_vs_zero_all(ipvs); goto out; } @@ -3431,7 +3532,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = 1; - ret = ip_vs_genl_parse_service(net, &usvc, + ret = ip_vs_genl_parse_service(ipvs, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], need_full_svc, &svc); if (ret) @@ -3470,7 +3571,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) /* The synchronization protocol is incompatible * with mixed family services */ - if (net_ipvs(net)->sync_state) { + if (ipvs->sync_state) { ret = -EINVAL; goto out; } @@ -3490,7 +3591,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) switch (cmd) { case IPVS_CMD_NEW_SERVICE: if (svc == NULL) - ret = ip_vs_add_service(net, &usvc, &svc); + ret = ip_vs_add_service(ipvs, &usvc, &svc); else ret = -EEXIST; break; @@ -3528,9 +3629,9 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg; void *reply; int ret, cmd, reply_cmd; - struct net *net; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); - net = skb_sknet(skb); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) @@ -3559,7 +3660,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_service *svc; - svc = ip_vs_genl_find_service(net, + svc = ip_vs_genl_find_service(ipvs, info->attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc)) { ret = PTR_ERR(svc); @@ -3580,7 +3681,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(net, &t); + __ip_vs_get_timeouts(ipvs, &t); #ifdef CONFIG_IP_VS_PROTO_TCP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout) || @@ -3735,10 +3836,10 @@ static void ip_vs_genl_unregister(void) * per netns intit/exit func. */ #ifdef CONFIG_SYSCTL -static int __net_init ip_vs_control_net_init_sysctl(struct net *net) +static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { + struct net *net = ipvs->net; int idx; - struct netns_ipvs *ipvs = net_ipvs(net); struct ctl_table *tbl; atomic_set(&ipvs->dropentry, 0); @@ -3757,6 +3858,10 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) } else tbl = vs_vars; /* Initialize sysctl defaults */ + for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) { + if (tbl[idx].proc_handler == proc_do_defense_mode) + tbl[idx].extra2 = ipvs; + } idx = 0; ipvs->sysctl_amemthresh = 1024; tbl[idx++].data = &ipvs->sysctl_amemthresh; @@ -3798,7 +3903,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) tbl[idx++].data = &ipvs->sysctl_backup_only; ipvs->sysctl_conn_reuse_mode = 1; tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; - + tbl[idx++].data = &ipvs->sysctl_schedule_icmp; + tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { @@ -3806,7 +3912,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) kfree(tbl); return -ENOMEM; } - ip_vs_start_estimator(net, &ipvs->tot_stats); + ip_vs_start_estimator(ipvs, &ipvs->tot_stats); ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); @@ -3815,14 +3921,14 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) return 0; } -static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct net *net = ipvs->net; cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(net, &ipvs->tot_stats); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); @@ -3830,8 +3936,8 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) #else -static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } -static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } +static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; } +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { } #endif @@ -3839,10 +3945,10 @@ static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, }; -int __net_init ip_vs_control_net_init(struct net *net) +int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { + struct net *net = ipvs->net; int i, idx; - struct netns_ipvs *ipvs = net_ipvs(net); /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -3851,7 +3957,7 @@ int __net_init ip_vs_control_net_init(struct net *net) INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, - (unsigned long) net); + (unsigned long) ipvs); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); @@ -3873,7 +3979,7 @@ int __net_init ip_vs_control_net_init(struct net *net) proc_create("ip_vs_stats_percpu", 0, net->proc_net, &ip_vs_stats_percpu_fops); - if (ip_vs_control_net_init_sysctl(net)) + if (ip_vs_control_net_init_sysctl(ipvs)) goto err; return 0; @@ -3883,12 +3989,12 @@ err: return -ENOMEM; } -void __net_exit ip_vs_control_net_cleanup(struct net *net) +void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct net *net = ipvs->net; - ip_vs_trash_cleanup(net); - ip_vs_control_net_cleanup_sysctl(net); + ip_vs_trash_cleanup(ipvs); + ip_vs_control_net_cleanup_sysctl(ipvs); remove_proc_entry("ip_vs_stats_percpu", net->proc_net); remove_proc_entry("ip_vs_stats", net->proc_net); remove_proc_entry("ip_vs", net->proc_net); diff --git a/kernel/net/netfilter/ipvs/ip_vs_est.c b/kernel/net/netfilter/ipvs/ip_vs_est.c index ef0eb0a8d..457c6c193 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_est.c +++ b/kernel/net/netfilter/ipvs/ip_vs_est.c @@ -102,10 +102,8 @@ static void estimation_timer(unsigned long arg) struct ip_vs_estimator *e; struct ip_vs_stats *s; u64 rate; - struct net *net = (struct net *)arg; - struct netns_ipvs *ipvs; + struct netns_ipvs *ipvs = (struct netns_ipvs *)arg; - ipvs = net_ipvs(net); spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); @@ -140,9 +138,8 @@ static void estimation_timer(unsigned long arg) mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } -void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) +void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; INIT_LIST_HEAD(&est->list); @@ -152,9 +149,8 @@ void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) spin_unlock_bh(&ipvs->est_lock); } -void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) +void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; spin_lock_bh(&ipvs->est_lock); @@ -192,18 +188,16 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) dst->outbps = (e->outbps + 0xF) >> 5; } -int __net_init ip_vs_estimator_net_init(struct net *net) +int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); - INIT_LIST_HEAD(&ipvs->est_list); spin_lock_init(&ipvs->est_lock); - setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); + setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)ipvs); mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); return 0; } -void __net_exit ip_vs_estimator_net_cleanup(struct net *net) +void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { - del_timer_sync(&net_ipvs(net)->est_timer); + del_timer_sync(&ipvs->est_timer); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_ftp.c b/kernel/net/netfilter/ipvs/ip_vs_ftp.c index 5d3daae98..d30c327bb 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_ftp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_ftp.c @@ -181,7 +181,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; - struct net *net; *diff = 0; @@ -223,14 +222,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, */ { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + ip_vs_conn_fill_param(cp->ipvs, AF_INET, iph->protocol, &from, port, &cp->caddr, 0, &p); n_cp = ip_vs_conn_out_get(&p); } if (!n_cp) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), + ip_vs_conn_fill_param(cp->ipvs, AF_INET, IPPROTO_TCP, &cp->caddr, 0, &cp->vaddr, port, &p); /* As above, this is ipv4 only */ @@ -289,9 +288,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * would be adjusted twice. */ - net = skb_net(skb); cp->app_data = NULL; - ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return ret; } @@ -320,7 +318,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; - struct net *net; /* no diff required for incoming packets */ *diff = 0; @@ -392,7 +389,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + ip_vs_conn_fill_param(cp->ipvs, AF_INET, iph->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); @@ -413,8 +410,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Move tunnel to listen state */ - net = skb_net(skb); - ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return 1; @@ -447,14 +443,14 @@ static int __net_init __ip_vs_ftp_init(struct net *net) if (!ipvs) return -ENOENT; - app = register_ip_vs_app(net, &ip_vs_ftp); + app = register_ip_vs_app(ipvs, &ip_vs_ftp); if (IS_ERR(app)) return PTR_ERR(app); for (i = 0; i < ports_count; i++) { if (!ports[i]) continue; - ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]); + ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; pr_info("%s: loaded support on port[%d] = %d\n", @@ -463,7 +459,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net) return 0; err_unreg: - unregister_ip_vs_app(net, &ip_vs_ftp); + unregister_ip_vs_app(ipvs, &ip_vs_ftp); return ret; } /* @@ -471,7 +467,12 @@ err_unreg: */ static void __ip_vs_ftp_exit(struct net *net) { - unregister_ip_vs_app(net, &ip_vs_ftp); + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return; + + unregister_ip_vs_app(ipvs, &ip_vs_ftp); } static struct pernet_operations ip_vs_ftp_ops = { diff --git a/kernel/net/netfilter/ipvs/ip_vs_lblc.c b/kernel/net/netfilter/ipvs/ip_vs_lblc.c index 127f14046..cccf4d637 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_lblc.c +++ b/kernel/net/netfilter/ipvs/ip_vs_lblc.c @@ -250,8 +250,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc) static int sysctl_lblc_expiration(struct ip_vs_service *svc) { #ifdef CONFIG_SYSCTL - struct netns_ipvs *ipvs = net_ipvs(svc->net); - return ipvs->sysctl_lblc_expiration; + return svc->ipvs->sysctl_lblc_expiration; #else return DEFAULT_EXPIRATION; #endif diff --git a/kernel/net/netfilter/ipvs/ip_vs_lblcr.c b/kernel/net/netfilter/ipvs/ip_vs_lblcr.c index 2229d2d8b..796d70e47 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/kernel/net/netfilter/ipvs/ip_vs_lblcr.c @@ -415,8 +415,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc) static int sysctl_lblcr_expiration(struct ip_vs_service *svc) { #ifdef CONFIG_SYSCTL - struct netns_ipvs *ipvs = net_ipvs(svc->net); - return ipvs->sysctl_lblcr_expiration; + return svc->ipvs->sysctl_lblcr_expiration; #else return DEFAULT_EXPIRATION; #endif diff --git a/kernel/net/netfilter/ipvs/ip_vs_nfct.c b/kernel/net/netfilter/ipvs/ip_vs_nfct.c index 5882bbfd1..30434fb13 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_nfct.c +++ b/kernel/net/netfilter/ipvs/ip_vs_nfct.c @@ -161,7 +161,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, /* RS->CLIENT */ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum, + ip_vs_conn_fill_param(net_ipvs(net), exp->tuple.src.l3num, orig->dst.protonum, &orig->src.u3, orig->src.u.tcp.port, &orig->dst.u3, orig->dst.u.tcp.port, &p); cp = ip_vs_conn_out_get(&p); @@ -274,8 +274,7 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) " for conn " FMT_CONN "\n", __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); - h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, - &tuple); + h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); /* Show what happens instead of calling nf_ct_kill() */ diff --git a/kernel/net/netfilter/ipvs/ip_vs_ovf.c b/kernel/net/netfilter/ipvs/ip_vs_ovf.c new file mode 100644 index 000000000..f7d62c3b7 --- /dev/null +++ b/kernel/net/netfilter/ipvs/ip_vs_ovf.c @@ -0,0 +1,86 @@ +/* + * IPVS: Overflow-Connection Scheduling module + * + * Authors: Raducu Deaconu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Scheduler implements "overflow" loadbalancing according to number of active + * connections , will keep all conections to the node with the highest weight + * and overflow to the next node if the number of connections exceeds the node's + * weight. + * Note that this scheduler might not be suitable for UDP because it only uses + * active connections + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* OVF Connection scheduling */ +static struct ip_vs_dest * +ip_vs_ovf_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *h = NULL; + int hw = 0, w; + + IP_VS_DBG(6, "ip_vs_ovf_schedule(): Scheduling...\n"); + /* select the node with highest weight, go to next in line if active + * connections exceed weight + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + w = atomic_read(&dest->weight); + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->activeconns) > w || + w == 0) + continue; + if (!h || w > hw) { + h = dest; + hw = w; + } + } + + if (h) { + IP_VS_DBG_BUF(6, "OVF: server %s:%u active %d w %d\n", + IP_VS_DBG_ADDR(h->af, &h->addr), + ntohs(h->port), + atomic_read(&h->activeconns), + atomic_read(&h->weight)); + return h; + } + + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; +} + +static struct ip_vs_scheduler ip_vs_ovf_scheduler = { + .name = "ovf", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_ovf_scheduler.n_list), + .schedule = ip_vs_ovf_schedule, +}; + +static int __init ip_vs_ovf_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_ovf_scheduler); +} + +static void __exit ip_vs_ovf_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_ovf_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_ovf_init); +module_exit(ip_vs_ovf_cleanup); +MODULE_LICENSE("GPL"); diff --git a/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c b/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c index bed5f7042..1b8d594e4 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/kernel/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -70,7 +70,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) const char *dptr; int retc; - ip_vs_fill_iph_skb(p->af, skb, &iph); + ip_vs_fill_iph_skb(p->af, skb, false, &iph); /* Only useful with UDP */ if (iph.protocol != IPPROTO_UDP) diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto.c b/kernel/net/netfilter/ipvs/ip_vs_proto.c index 939f7fbe9..8ae480715 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto.c @@ -63,9 +63,8 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) * register an ipvs protocols netns related data */ static int -register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) +register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp) { - struct netns_ipvs *ipvs = net_ipvs(net); unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); @@ -79,7 +78,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) atomic_set(&pd->appcnt, 0); /* Init app counter */ if (pp->init_netns != NULL) { - int ret = pp->init_netns(net, pd); + int ret = pp->init_netns(ipvs, pd); if (ret) { /* unlink an free proto data */ ipvs->proto_data_table[hash] = pd->next; @@ -116,9 +115,8 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) * unregister an ipvs protocols netns data */ static int -unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) +unregister_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data **pd_p; unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); @@ -127,7 +125,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) if (*pd_p == pd) { *pd_p = pd->next; if (pd->pp->exit_netns != NULL) - pd->pp->exit_netns(net, pd); + pd->pp->exit_netns(ipvs, pd); kfree(pd); return 0; } @@ -156,8 +154,8 @@ EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ -static struct ip_vs_proto_data * -__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; unsigned int hash = IP_VS_PROTO_HASH(proto); @@ -169,14 +167,6 @@ __ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) return NULL; } - -struct ip_vs_proto_data * -ip_vs_proto_data_get(struct net *net, unsigned short proto) -{ - struct netns_ipvs *ipvs = net_ipvs(net); - - return __ipvs_proto_data_get(ipvs, proto); -} EXPORT_SYMBOL(ip_vs_proto_data_get); /* @@ -317,7 +307,7 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, /* * per network name-space init */ -int __net_init ip_vs_protocol_net_init(struct net *net) +int __net_init ip_vs_protocol_net_init(struct netns_ipvs *ipvs) { int i, ret; static struct ip_vs_protocol *protos[] = { @@ -339,27 +329,26 @@ int __net_init ip_vs_protocol_net_init(struct net *net) }; for (i = 0; i < ARRAY_SIZE(protos); i++) { - ret = register_ip_vs_proto_netns(net, protos[i]); + ret = register_ip_vs_proto_netns(ipvs, protos[i]); if (ret < 0) goto cleanup; } return 0; cleanup: - ip_vs_protocol_net_cleanup(net); + ip_vs_protocol_net_cleanup(ipvs); return ret; } -void __net_exit ip_vs_protocol_net_cleanup(struct net *net) +void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd; int i; /* unregister all the ipvs proto data for this netns */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pd = ipvs->proto_data_table[i]) != NULL) - unregister_ip_vs_proto_netns(net, pd); + unregister_ip_vs_proto_netns(ipvs, pd); } } diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 5de3dd312..5320d3997 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -41,30 +41,28 @@ struct isakmp_hdr { #define PORT_ISAKMP 500 static void -ah_esp_conn_fill_param_proto(struct net *net, int af, - const struct ip_vs_iphdr *iph, int inverse, +ah_esp_conn_fill_param_proto(struct netns_ipvs *ipvs, int af, + const struct ip_vs_iphdr *iph, struct ip_vs_conn_param *p) { - if (likely(!inverse)) - ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + if (likely(!ip_vs_iph_inverse(iph))) + ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->saddr, htons(PORT_ISAKMP), &iph->daddr, htons(PORT_ISAKMP), p); else - ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->daddr, htons(PORT_ISAKMP), &iph->saddr, htons(PORT_ISAKMP), p); } static struct ip_vs_conn * -ah_esp_conn_in_get(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, - int inverse) +ah_esp_conn_in_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; - struct net *net = skb_net(skb); - ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_in_get(&p); if (!cp) { /* @@ -73,7 +71,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, */ IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " "%s%s %s->%s\n", - inverse ? "ICMP+" : "", + ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); @@ -84,19 +82,18 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, static struct ip_vs_conn * -ah_esp_conn_out_get(int af, const struct sk_buff *skb, - const struct ip_vs_iphdr *iph, int inverse) +ah_esp_conn_out_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; - struct net *net = skb_net(skb); - ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", - inverse ? "ICMP+" : "", + ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); @@ -107,7 +104,8 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, static int -ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +ah_esp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c index 5b84c0b56..010ddeec1 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -9,35 +9,44 @@ #include static int -sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { - struct net *net; struct ip_vs_service *svc; - struct netns_ipvs *ipvs; sctp_chunkhdr_t _schunkh, *sch; sctp_sctphdr_t *sh, _sctph; - - sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); - if (sh == NULL) { - *verdict = NF_DROP; - return 0; + __be16 _ports[2], *ports = NULL; + + if (likely(!ip_vs_iph_icmp(iph))) { + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (sh) { + sch = skb_header_pointer( + skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(_schunkh), &_schunkh); + if (sch && (sch->type == SCTP_CID_INIT || + sysctl_sloppy_sctp(ipvs))) + ports = &sh->source; + } + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); } - sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), - sizeof(_schunkh), &_schunkh); - if (sch == NULL) { + if (!ports) { *verdict = NF_DROP; return 0; } - net = skb_net(skb); - ipvs = net_ipvs(net); rcu_read_lock(); - if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) && - (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, - &iph->daddr, sh->dest))) { + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { @@ -474,14 +483,13 @@ static inline __u16 sctp_app_hashkey(__be16 port) & SCTP_APP_TAB_MASK; } -static int sctp_register_app(struct net *net, struct ip_vs_app *inc) +static int sctp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); hash = sctp_app_hashkey(port); @@ -498,9 +506,9 @@ out: return ret; } -static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) +static void sctp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); @@ -508,7 +516,7 @@ static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) static int sctp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; @@ -549,10 +557,8 @@ out: * timeouts is netns related now. * --------------------------------------------- */ -static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_sctp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); - ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); @@ -561,7 +567,7 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) return 0; } -static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) +static void __ip_vs_sctp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c index 8e92beb0c..d7024b2ed 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -32,27 +32,47 @@ #include static int -tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { - struct net *net; struct ip_vs_service *svc; struct tcphdr _tcph, *th; - struct netns_ipvs *ipvs; + __be16 _ports[2], *ports = NULL; - th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); - if (th == NULL) { + /* In the event of icmp, we're only guaranteed to have the first 8 + * bytes of the transport header, so we only check the rest of the + * TCP packet for non-ICMP packets + */ + if (likely(!ip_vs_iph_icmp(iph))) { + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th) { + if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn)) + return 1; + ports = &th->source; + } + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); + } + + if (!ports) { *verdict = NF_DROP; return 0; } - net = skb_net(skb); - ipvs = net_ipvs(net); + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ rcu_read_lock(); - if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst && - (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, - &iph->daddr, th->dest))) { + + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + + if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { @@ -571,14 +591,13 @@ static inline __u16 tcp_app_hashkey(__be16 port) } -static int tcp_register_app(struct net *net, struct ip_vs_app *inc) +static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); hash = tcp_app_hashkey(port); @@ -597,9 +616,9 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc) static void -tcp_unregister_app(struct net *net, struct ip_vs_app *inc) +tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); @@ -609,7 +628,7 @@ tcp_unregister_app(struct net *net, struct ip_vs_app *inc) static int tcp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; @@ -653,9 +672,9 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* * Set LISTEN timeout. (ip_vs_conn_put will setup timer) */ -void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) +void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP); spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; @@ -668,10 +687,8 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) * timeouts is netns related now. * --------------------------------------------- */ -static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); - ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); @@ -681,7 +698,7 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) return 0; } -static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) +static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c b/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c index b62a3c0ff..e494e9a88 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/kernel/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -29,28 +29,42 @@ #include static int -udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, +udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { - struct net *net; struct ip_vs_service *svc; struct udphdr _udph, *uh; + __be16 _ports[2], *ports = NULL; - /* IPv6 fragments, only first fragment will hit this */ - uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); - if (uh == NULL) { + if (likely(!ip_vs_iph_icmp(iph))) { + /* IPv6 fragments, only first fragment will hit this */ + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + if (uh) + ports = &uh->source; + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); + } + + if (!ports) { *verdict = NF_DROP; return 0; } - net = skb_net(skb); + rcu_read_lock(); - svc = ip_vs_service_find(net, af, skb->mark, iph->protocol, - &iph->daddr, uh->dest); + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + if (svc) { int ignored; - if (ip_vs_todrop(net_ipvs(net))) { + if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( @@ -348,14 +362,13 @@ static inline __u16 udp_app_hashkey(__be16 port) } -static int udp_register_app(struct net *net, struct ip_vs_app *inc) +static int udp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; - struct netns_ipvs *ipvs = net_ipvs(net); - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); hash = udp_app_hashkey(port); @@ -374,9 +387,9 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc) static void -udp_unregister_app(struct net *net, struct ip_vs_app *inc) +udp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { - struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); @@ -385,7 +398,7 @@ udp_unregister_app(struct net *net, struct ip_vs_app *inc) static int udp_app_conn_bind(struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; @@ -456,10 +469,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction, cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; } -static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { - struct netns_ipvs *ipvs = net_ipvs(net); - ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); @@ -468,7 +479,7 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) return 0; } -static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) +static void __udp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } diff --git a/kernel/net/netfilter/ipvs/ip_vs_sched.c b/kernel/net/netfilter/ipvs/ip_vs_sched.c index 199760c71..a2ff7d746 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_sched.c +++ b/kernel/net/netfilter/ipvs/ip_vs_sched.c @@ -74,7 +74,7 @@ void ip_vs_unbind_scheduler(struct ip_vs_service *svc, if (sched->done_service) sched->done_service(svc); - /* svc->scheduler can not be set to NULL */ + /* svc->scheduler can be set to NULL only by caller */ } @@ -137,7 +137,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) { - if (scheduler && scheduler->module) + if (scheduler) module_put(scheduler->module); } @@ -147,21 +147,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) { - struct ip_vs_scheduler *sched; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? sched->name : "none"; - sched = rcu_dereference(svc->scheduler); if (svc->fwmark) { IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", - sched->name, svc->fwmark, svc->fwmark, msg); + sched_name, svc->fwmark, svc->fwmark, msg); #ifdef CONFIG_IP_VS_IPV6 } else if (svc->af == AF_INET6) { IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", - sched->name, ip_vs_proto_name(svc->protocol), + sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), msg); #endif } else { IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", - sched->name, ip_vs_proto_name(svc->protocol), + sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.ip, ntohs(svc->port), msg); } } diff --git a/kernel/net/netfilter/ipvs/ip_vs_sh.c b/kernel/net/netfilter/ipvs/ip_vs_sh.c index 98a13433b..1e373a5e4 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_sh.c +++ b/kernel/net/netfilter/ipvs/ip_vs_sh.c @@ -280,35 +280,29 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, static inline __be16 ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { - __be16 port; - struct tcphdr _tcph, *th; - struct udphdr _udph, *uh; - sctp_sctphdr_t _sctph, *sh; + __be16 _ports[2], *ports; + /* At this point we know that we have a valid packet of some kind. + * Because ICMP packets are only guaranteed to have the first 8 + * bytes, let's just grab the ports. Fortunately they're in the + * same position for all three of the protocols we care about. + */ switch (iph->protocol) { case IPPROTO_TCP: - th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); - if (unlikely(th == NULL)) - return 0; - port = th->source; - break; case IPPROTO_UDP: - uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); - if (unlikely(uh == NULL)) - return 0; - port = uh->source; - break; case IPPROTO_SCTP: - sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); - if (unlikely(sh == NULL)) + ports = skb_header_pointer(skb, iph->len, sizeof(_ports), + &_ports); + if (unlikely(!ports)) return 0; - port = sh->source; - break; + + if (likely(!ip_vs_iph_inverse(iph))) + return ports[0]; + else + return ports[1]; default: - port = 0; + return 0; } - - return port; } @@ -322,6 +316,9 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_dest *dest; struct ip_vs_sh_state *s; __be16 port = 0; + const union nf_inet_addr *hash_addr; + + hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); @@ -331,9 +328,9 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, s = (struct ip_vs_sh_state *) svc->sched_data; if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) - dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port); + dest = ip_vs_sh_get_fallback(svc, s, hash_addr, port); else - dest = ip_vs_sh_get(svc, s, &iph->saddr, port); + dest = ip_vs_sh_get(svc, s, hash_addr, port); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); @@ -341,7 +338,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, } IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", - IP_VS_DBG_ADDR(svc->af, &iph->saddr), + IP_VS_DBG_ADDR(svc->af, hash_addr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); diff --git a/kernel/net/netfilter/ipvs/ip_vs_sync.c b/kernel/net/netfilter/ipvs/ip_vs_sync.c index 19b9cce6c..803001a45 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_sync.c +++ b/kernel/net/netfilter/ipvs/ip_vs_sync.c @@ -193,7 +193,7 @@ union ip_vs_sync_conn { #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) struct ip_vs_sync_thread_data { - struct net *net; + struct netns_ipvs *ipvs; struct socket *sock; char *buf; int id; @@ -262,6 +262,11 @@ struct ip_vs_sync_mesg { /* ip_vs_sync_conn entries start here */ }; +union ipvs_sockaddr { + struct sockaddr_in in; + struct sockaddr_in6 in6; +}; + struct ip_vs_sync_buff { struct list_head list; unsigned long firstuse; @@ -320,26 +325,28 @@ sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) * Create a new sync buffer for Version 1 proto. */ static inline struct ip_vs_sync_buff * -ip_vs_sync_buff_create(struct netns_ipvs *ipvs) +ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); if (!sb->mesg) { kfree(sb); return NULL; } sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; - sb->mesg->syncid = ipvs->master_syncid; + sb->mesg->syncid = ipvs->mcfg.syncid; sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); sb->mesg->nr_conns = 0; sb->mesg->spare = 0; sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); - sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; + sb->end = (unsigned char *)sb->mesg + len; sb->firstuse = jiffies; return sb; @@ -402,7 +409,7 @@ select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) * Create a new sync buffer for Version 0 proto. */ static inline struct ip_vs_sync_buff * -ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; struct ip_vs_sync_mesg_v0 *mesg; @@ -410,17 +417,19 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); if (!sb->mesg) { kfree(sb); return NULL; } mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; mesg->nr_conns = 0; - mesg->syncid = ipvs->master_syncid; + mesg->syncid = ipvs->mcfg.syncid; mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); - sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; + sb->end = (unsigned char *)mesg + len; sb->firstuse = jiffies; return sb; } @@ -524,16 +533,15 @@ set: * Version 0 , could be switched in by sys_ctl. * Add an ip_vs_conn information into the current sync_buff. */ -static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, +static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg_v0 *m; struct ip_vs_sync_conn_v0 *s; struct ip_vs_sync_buff *buff; struct ipvs_master_sync_state *ms; int id; - int len; + unsigned int len; if (unlikely(cp->af != AF_INET)) return; @@ -553,17 +561,19 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, id = select_master_thread_id(ipvs, cp); ms = &ipvs->ms[id]; buff = ms->sync_buff; + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; if (buff) { m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; /* Send buffer if it is for v1 */ - if (!m->nr_conns) { + if (buff->head + len > buff->end || !m->nr_conns) { sb_queue_tail(ipvs, ms); ms->sync_buff = NULL; buff = NULL; } } if (!buff) { - buff = ip_vs_sync_buff_create_v0(ipvs); + buff = ip_vs_sync_buff_create_v0(ipvs, len); if (!buff) { spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); @@ -572,8 +582,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, ms->sync_buff = buff; } - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; s = (struct ip_vs_sync_conn_v0 *) buff->head; @@ -597,12 +605,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, m->nr_conns++; m->size = htons(ntohs(m->size) + len); buff->head += len; - - /* check if there is a space for next one */ - if (buff->head + FULL_CONN_SIZE > buff->end) { - sb_queue_tail(ipvs, ms); - ms->sync_buff = NULL; - } spin_unlock_bh(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ @@ -612,7 +614,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, pkts = atomic_add_return(1, &cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); - ip_vs_sync_conn(net, cp->control, pkts); + ip_vs_sync_conn(ipvs, cp, pkts); } } @@ -621,9 +623,8 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, * Called by ip_vs_in. * Sending Version 1 messages */ -void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) +void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m; union ip_vs_sync_conn *s; struct ip_vs_sync_buff *buff; @@ -634,7 +635,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) /* Handle old version of the protocol */ if (sysctl_sync_ver(ipvs) == 0) { - ip_vs_sync_conn_v0(net, cp, pkts); + ip_vs_sync_conn_v0(ipvs, cp, pkts); return; } /* Do not sync ONE PACKET */ @@ -694,7 +695,7 @@ sloop: } if (!buff) { - buff = ip_vs_sync_buff_create(ipvs); + buff = ip_vs_sync_buff_create(ipvs, len); if (!buff) { spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); @@ -781,21 +782,21 @@ control: * fill_param used by version 1 */ static inline int -ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, +ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, struct ip_vs_conn_param *p, __u8 *pe_data, unsigned int pe_data_len, __u8 *pe_name, unsigned int pe_name_len) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) - ip_vs_conn_fill_param(net, af, sc->v6.protocol, + ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, (const union nf_inet_addr *)&sc->v6.caddr, sc->v6.cport, (const union nf_inet_addr *)&sc->v6.vaddr, sc->v6.vport, p); else #endif - ip_vs_conn_fill_param(net, af, sc->v4.protocol, + ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, (const union nf_inet_addr *)&sc->v4.caddr, sc->v4.cport, (const union nf_inet_addr *)&sc->v4.vaddr, @@ -834,7 +835,7 @@ ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, * Param: ... * timeout is in sec. */ -static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, +static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, unsigned int flags, unsigned int state, unsigned int protocol, unsigned int type, const union nf_inet_addr *daddr, __be16 dport, @@ -843,7 +844,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, { struct ip_vs_dest *dest; struct ip_vs_conn *cp; - struct netns_ipvs *ipvs = net_ipvs(net); if (!(flags & IP_VS_CONN_F_TEMPLATE)) { cp = ip_vs_conn_in_get(param); @@ -901,7 +901,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, * with synchronization, so we can make the assumption that * the svc_af is the same as the dest_af */ - dest = ip_vs_find_dest(net, type, type, daddr, dport, + dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, param->vaddr, param->vport, protocol, fwmark, flags); @@ -938,7 +938,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, } else { struct ip_vs_proto_data *pd; - pd = ip_vs_proto_data_get(net, protocol); + pd = ip_vs_proto_data_get(ipvs, protocol); if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) cp->timeout = pd->timeout_table[state]; else @@ -950,7 +950,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, /* * Process received multicast message for Version 0 */ -static void ip_vs_process_message_v0(struct net *net, const char *buffer, +static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, const size_t buflen) { struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; @@ -1006,14 +1006,14 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer, } } - ip_vs_conn_fill_param(net, AF_INET, s->protocol, + ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, (const union nf_inet_addr *)&s->caddr, s->cport, (const union nf_inet_addr *)&s->vaddr, s->vport, ¶m); /* Send timeout as Zero */ - ip_vs_proc_conn(net, ¶m, flags, state, s->protocol, AF_INET, + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, (union nf_inet_addr *)&s->daddr, s->dport, 0, 0, opt); } @@ -1064,7 +1064,7 @@ static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, /* * Process a Version 1 sync. connection */ -static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) +static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) { struct ip_vs_sync_conn_options opt; union ip_vs_sync_conn *s; @@ -1168,21 +1168,21 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) state = 0; } } - if (ip_vs_conn_fill_param_sync(net, af, s, ¶m, pe_data, + if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, pe_data_len, pe_name, pe_name_len)) { retc = 50; goto out; } /* If only IPv4, just silent skip IPv6 */ if (af == AF_INET) - ip_vs_proc_conn(net, ¶m, flags, state, s->v4.protocol, af, + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, ntohl(s->v4.timeout), ntohl(s->v4.fwmark), (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) ); #ifdef CONFIG_IP_VS_IPV6 else - ip_vs_proc_conn(net, ¶m, flags, state, s->v6.protocol, af, + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, ntohl(s->v6.timeout), ntohl(s->v6.fwmark), (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) @@ -1201,10 +1201,9 @@ out: * ip_vs_conn entries. * Handles Version 0 & 1 */ -static void ip_vs_process_message(struct net *net, __u8 *buffer, +static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, const size_t buflen) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; __u8 *p, *msg_end; int i, nr_conns; @@ -1219,7 +1218,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, return; } /* SyncID sanity check */ - if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { + if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); return; } @@ -1254,7 +1253,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, return; } /* Process a single sync_conn */ - retc = ip_vs_proc_sync_conn(net, p, msg_end); + retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); if (retc < 0) { IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", retc); @@ -1265,7 +1264,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, } } else { /* Old type of message */ - ip_vs_process_message_v0(net, buffer, buflen); + ip_vs_process_message_v0(ipvs, buffer, buflen); return; } } @@ -1303,6 +1302,14 @@ static void set_mcast_loop(struct sock *sk, u_char loop) /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ lock_sock(sk); inet->mc_loop = loop ? 1 : 0; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_LOOP */ + np->mc_loop = loop ? 1 : 0; + } +#endif release_sock(sk); } @@ -1316,6 +1323,33 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ lock_sock(sk); inet->mc_ttl = ttl; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_HOPS */ + np->mcast_hops = ttl; + } +#endif + release_sock(sk); +} + +/* Control fragmentation of messages */ +static void set_mcast_pmtudisc(struct sock *sk, int val) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ + lock_sock(sk); + inet->pmtudisc = val; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MTU_DISCOVER */ + np->pmtudisc = val; + } +#endif release_sock(sk); } @@ -1338,44 +1372,15 @@ static int set_mcast_if(struct sock *sk, char *ifname) lock_sock(sk); inet->mc_index = dev->ifindex; /* inet->mc_addr = 0; */ - release_sock(sk); - - return 0; -} - +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); -/* - * Set the maximum length of sync message according to the - * specified interface's MTU. - */ -static int set_sync_mesg_maxlen(struct net *net, int sync_state) -{ - struct netns_ipvs *ipvs = net_ipvs(net); - struct net_device *dev; - int num; - - if (sync_state == IP_VS_STATE_MASTER) { - dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); - if (!dev) - return -ENODEV; - - num = (dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr) - - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN + - SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); - IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", ipvs->send_mesg_maxlen); - } else if (sync_state == IP_VS_STATE_BACKUP) { - dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); - if (!dev) - return -ENODEV; - - ipvs->recv_mesg_maxlen = dev->mtu - - sizeof(struct iphdr) - sizeof(struct udphdr); - IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", ipvs->recv_mesg_maxlen); + /* IPV6_MULTICAST_IF */ + np->mcast_oif = dev->ifindex; } +#endif + release_sock(sk); return 0; } @@ -1405,15 +1410,34 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) mreq.imr_ifindex = dev->ifindex; - rtnl_lock(); lock_sock(sk); ret = ip_mc_join_group(sk, &mreq); release_sock(sk); - rtnl_unlock(); return ret; } +#ifdef CONFIG_IP_VS_IPV6 +static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, + char *ifname) +{ + struct net *net = sock_net(sk); + struct net_device *dev; + int ret; + + dev = __dev_get_by_name(net, ifname); + if (!dev) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); + release_sock(sk); + + return ret; +} +#endif static int bind_mcastif_addr(struct socket *sock, char *ifname) { @@ -1442,53 +1466,69 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); } +static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, + struct ipvs_sync_daemon_cfg *c, int id) +{ + if (AF_INET6 == c->mcast_af) { + sa->in6 = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_port = htons(c->mcast_port + id), + }; + sa->in6.sin6_addr = c->mcast_group.in6; + *salen = sizeof(sa->in6); + } else { + sa->in = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_port = htons(c->mcast_port + id), + }; + sa->in.sin_addr = c->mcast_group.in; + *salen = sizeof(sa->in); + } +} + /* * Set up sending multicast socket over UDP */ -static struct socket *make_send_sock(struct net *net, int id) +static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id) { - struct netns_ipvs *ipvs = net_ipvs(net); /* multicast addr */ - struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), - }; + union ipvs_sockaddr mcast_addr; struct socket *sock; - int result; + int result, salen; - /* First create a socket move it to right name space later */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + /* First create a socket */ + result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); - result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); + result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); goto error; } set_mcast_loop(sock->sk, 0); - set_mcast_ttl(sock->sk, 1); + set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); + /* Allow fragmentation if MTU changes */ + set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); result = sysctl_sync_sock_size(ipvs); if (result > 0) set_sock_size(sock->sk, 1, result); - result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); + if (AF_INET == ipvs->mcfg.mcast_af) + result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); + else + result = 0; if (result < 0) { pr_err("Error binding address of the mcast interface\n"); goto error; } + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr), 0); + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; @@ -1497,7 +1537,7 @@ static struct socket *make_send_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1505,47 +1545,42 @@ error: /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct net *net, int id) +static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) { - struct netns_ipvs *ipvs = net_ipvs(net); /* multicast addr */ - struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), - }; + union ipvs_sockaddr mcast_addr; struct socket *sock; - int result; + int result, salen; /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = SK_CAN_REUSE; result = sysctl_sync_sock_size(ipvs); if (result > 0) set_sock_size(sock->sk, 0, result); - result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr)); + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); + result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; } /* join the multicast group */ - result = join_mcast_group(sock->sk, - (struct in_addr *) &mcast_addr.sin_addr, - ipvs->backup_mcast_ifn); +#ifdef CONFIG_IP_VS_IPV6 + if (ipvs->bcfg.mcast_af == AF_INET6) + result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, + ipvs->bcfg.mcast_ifn); + else +#endif + result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, + ipvs->bcfg.mcast_ifn); if (result < 0) { pr_err("Error joining to the multicast group\n"); goto error; @@ -1554,7 +1589,7 @@ static struct socket *make_receive_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1646,14 +1681,14 @@ next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) static int sync_thread_master(void *data) { struct ip_vs_sync_thread_data *tinfo = data; - struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct netns_ipvs *ipvs = tinfo->ipvs; struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; struct sock *sk = tinfo->sock->sk; struct ip_vs_sync_buff *sb; pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " "syncid = %d, id = %d\n", - ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); + ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); for (;;) { sb = next_sync_buff(ipvs, ms); @@ -1692,7 +1727,7 @@ done: ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo); return 0; @@ -1702,12 +1737,12 @@ done: static int sync_thread_backup(void *data) { struct ip_vs_sync_thread_data *tinfo = data; - struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct netns_ipvs *ipvs = tinfo->ipvs; int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " "syncid = %d, id = %d\n", - ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); + ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -1717,19 +1752,19 @@ static int sync_thread_backup(void *data) /* do we have data now? */ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { len = ip_vs_receive(tinfo->sock, tinfo->buf, - ipvs->recv_mesg_maxlen); + ipvs->bcfg.sync_maxlen); if (len <= 0) { if (len != -EAGAIN) pr_err("receiving message error\n"); break; } - ip_vs_process_message(tinfo->net, tinfo->buf, len); + ip_vs_process_message(ipvs, tinfo->buf, len); } } /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); @@ -1737,16 +1772,18 @@ static int sync_thread_backup(void *data) } -int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) +int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, + int state) { struct ip_vs_sync_thread_data *tinfo; struct task_struct **array = NULL, *task; struct socket *sock; - struct netns_ipvs *ipvs = net_ipvs(net); + struct net_device *dev; char *name; int (*threadfn)(void *data); - int id, count; + int id, count, hlen; int result = -ENOMEM; + u16 mtu, min_mtu; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", @@ -1758,22 +1795,46 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) } else count = ipvs->threads_mask + 1; + if (c->mcast_af == AF_UNSPEC) { + c->mcast_af = AF_INET; + c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); + } + if (!c->mcast_port) + c->mcast_port = IP_VS_SYNC_PORT; + if (!c->mcast_ttl) + c->mcast_ttl = 1; + + dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); + if (!dev) { + pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); + return -ENODEV; + } + hlen = (AF_INET6 == c->mcast_af) ? + sizeof(struct ipv6hdr) + sizeof(struct udphdr) : + sizeof(struct iphdr) + sizeof(struct udphdr); + mtu = (state == IP_VS_STATE_BACKUP) ? + clamp(dev->mtu, 1500U, 65535U) : 1500U; + min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; + + if (c->sync_maxlen) + c->sync_maxlen = clamp_t(unsigned int, + c->sync_maxlen, min_mtu, + 65535 - hlen); + else + c->sync_maxlen = mtu - hlen; + if (state == IP_VS_STATE_MASTER) { if (ipvs->ms) return -EEXIST; - strlcpy(ipvs->master_mcast_ifn, mcast_ifn, - sizeof(ipvs->master_mcast_ifn)); - ipvs->master_syncid = syncid; + ipvs->mcfg = *c; name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; } else if (state == IP_VS_STATE_BACKUP) { if (ipvs->backup_threads) return -EEXIST; - strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, - sizeof(ipvs->backup_mcast_ifn)); - ipvs->backup_syncid = syncid; + ipvs->bcfg = *c; name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; } else { @@ -1801,14 +1862,13 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) if (!array) goto out; } - set_sync_mesg_maxlen(net, state); tinfo = NULL; for (id = 0; id < count; id++) { if (state == IP_VS_STATE_MASTER) - sock = make_send_sock(net, id); + sock = make_send_sock(ipvs, id); else - sock = make_receive_sock(net, id); + sock = make_receive_sock(ipvs, id); if (IS_ERR(sock)) { result = PTR_ERR(sock); goto outtinfo; @@ -1816,10 +1876,10 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) goto outsocket; - tinfo->net = net; + tinfo->ipvs = ipvs; tinfo->sock = sock; if (state == IP_VS_STATE_BACKUP) { - tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, GFP_KERNEL); if (!tinfo->buf) goto outtinfo; @@ -1854,11 +1914,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) return 0; outsocket: - sk_release_kernel(sock->sk); + sock_release(sock); outtinfo: if (tinfo) { - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); } @@ -1880,9 +1940,8 @@ out: } -int stop_sync_thread(struct net *net, int state) +int stop_sync_thread(struct netns_ipvs *ipvs, int state) { - struct netns_ipvs *ipvs = net_ipvs(net); struct task_struct **array; int id; int retc = -EINVAL; @@ -1948,27 +2007,24 @@ int stop_sync_thread(struct net *net, int state) /* * Initialize data struct for each netns */ -int __net_init ip_vs_sync_net_init(struct net *net) +int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) { - struct netns_ipvs *ipvs = net_ipvs(net); - __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); spin_lock_init(&ipvs->sync_lock); spin_lock_init(&ipvs->sync_buff_lock); return 0; } -void ip_vs_sync_net_cleanup(struct net *net) +void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) { int retc; - struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&ipvs->sync_mutex); - retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); if (retc && retc != -ESRCH) pr_err("Failed to stop Master Daemon\n"); - retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); if (retc && retc != -ESRCH) pr_err("Failed to stop Backup Daemon\n"); mutex_unlock(&ipvs->sync_mutex); diff --git a/kernel/net/netfilter/ipvs/ip_vs_xmit.c b/kernel/net/netfilter/ipvs/ip_vs_xmit.c index 19986ec5f..3264cb49b 100644 --- a/kernel/net/netfilter/ipvs/ip_vs_xmit.c +++ b/kernel/net/netfilter/ipvs/ip_vs_xmit.c @@ -130,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; - fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? FLOWI_FLAG_KNOWN_NH : 0; @@ -213,19 +212,20 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } -static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, +static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, + int rt_mode, struct ip_vs_iphdr *ipvsh, struct sk_buff *skb, int mtu) { #ifdef CONFIG_IP_VS_IPV6 if (skb_af == AF_INET6) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = ipvs->net; if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { if (!skb->dev) skb->dev = net->loopback_dev; /* only send ICMP too big on first fragment */ - if (!ipvsh->fragoffs) + if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh)) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); @@ -234,8 +234,6 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, } else #endif { - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); - /* If we're going to tunnel the packet and pmtu discovery * is disabled, we'll just fragment it anyway */ @@ -243,7 +241,8 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, return true; if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && - skb->len > mtu && !skb_is_gso(skb))) { + skb->len > mtu && !skb_is_gso(skb) && + !ip_vs_iph_icmp(ipvsh))) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG(1, "frag needed for %pI4\n", @@ -257,11 +256,12 @@ static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, /* Get route to destination or remote server */ static int -__ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + struct ip_vs_dest *dest, __be32 daddr, int rt_mode, __be32 *ret_saddr, struct ip_vs_iphdr *ipvsh) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = ipvs->net; struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ int mtu; @@ -337,7 +337,7 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, maybe_update_pmtu(skb_af, skb, mtu); } - if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) + if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; skb_dst_drop(skb); @@ -364,13 +364,16 @@ err_unreach: #ifdef CONFIG_IP_VS_IPV6 static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, - struct in6_addr *ret_saddr, int do_xfrm) + struct in6_addr *ret_saddr, int do_xfrm, int rt_mode) { struct dst_entry *dst; struct flowi6 fl6 = { .daddr = *daddr, }; + if (rt_mode & IP_VS_RT_MODE_KNOWN_NH) + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; + dst = ip6_route_output(net, NULL, &fl6); if (dst->error) goto out_err; @@ -400,11 +403,12 @@ out_err: * Get route to destination or remote server */ static int -__ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = ipvs->net; struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ struct dst_entry *dst; @@ -427,7 +431,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, } dst = __ip_vs_route_output_v6(net, &dest->addr.in6, &dest_dst->dst_saddr.in6, - do_xfrm); + do_xfrm, rt_mode); if (!dst) { __ip_vs_dst_set(dest, NULL, NULL, 0); spin_unlock_bh(&dest->dst_lock); @@ -435,7 +439,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, goto err_unreach; } rt = (struct rt6_info *) dst; - cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", @@ -446,7 +450,8 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, *ret_saddr = dest_dst->dst_saddr.in6; } else { noref = 0; - dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, + rt_mode); if (!dst) goto err_unreach; rt = (struct rt6_info *) dst; @@ -481,7 +486,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, maybe_update_pmtu(skb_af, skb, mtu); } - if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) + if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; skb_dst_drop(skb); @@ -501,6 +506,13 @@ err_put: return -1; err_unreach: + /* The ip6_link_failure function requires the dev field to be set + * in order to get the net (further for the sake of fwmark + * reflection). + */ + if (!skb->dev) + skb->dev = skb_dst(skb)->dev; + dst_link_failure(skb); return -1; } @@ -519,10 +531,27 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset(skb); skb_forward_csum(skb); + if (!skb->sk) + skb_sender_cpu_clear(skb); } return ret; } +/* In the event of a remote destination, it's possible that we would have + * matches against an old socket (particularly a TIME-WAIT socket). This + * causes havoc down the line (ip_local_out et. al. expect regular sockets + * and invalid memory accesses will happen) so simply drop the association + * in this case. +*/ +static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb) +{ + /* If dev is set, the packet came from the LOCAL_IN callback and + * not from a local TCP socket. + */ + if (skb->dev) + skb_orphan(skb); +} + /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, struct ip_vs_conn *cp, int local) @@ -534,12 +563,23 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 1); + + /* Remove the early_demux association unless it's bound for the + * exact same port and address on this host after translation. + */ + if (!local || cp->vport != cp->dport || + !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr)) + ip_vs_drop_early_demux_sk(skb); + if (!local) { skb_forward_csum(skb); - NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, - NULL, skb_dst(skb)->dev, dst_output_sk); + if (!skb->sk) + skb_sender_cpu_clear(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, + NULL, skb_dst(skb)->dev, dst_output); } else ret = NF_ACCEPT; + return ret; } @@ -553,9 +593,12 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) ip_vs_notrack(skb); if (!local) { + ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); - NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, - NULL, skb_dst(skb)->dev, dst_output_sk); + if (!skb->sk) + skb_sender_cpu_clear(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, + NULL, skb_dst(skb)->dev, dst_output); } else ret = NF_ACCEPT; return ret; @@ -588,7 +631,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr, + if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) goto tx_error; @@ -615,10 +658,13 @@ int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { + struct ipv6hdr *iph = ipv6_hdr(skb); + EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt_v6(cp->af, skb, NULL, &ipvsh->daddr.in6, NULL, + if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL, + &iph->daddr, NULL, ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; @@ -665,7 +711,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } was_input = rt_is_input_route(skb_rtable(skb)); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR, NULL, ipvsh); @@ -682,7 +728,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, + IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off, "ip_vs_nat_xmit(): " "stopping DNAT to local address"); goto tx_error; @@ -692,8 +738,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { - IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " - "stopping DNAT to loopback address"); + IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off, + "ip_vs_nat_xmit(): stopping DNAT to loopback " + "address"); goto tx_error; } @@ -710,7 +757,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); - IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still @@ -753,7 +800,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -771,7 +819,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, + IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); goto tx_error; @@ -781,8 +829,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { - IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); goto tx_error; @@ -800,7 +848,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; ipv6_hdr(skb)->daddr = cp->daddr.in6; - IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still @@ -841,6 +889,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, struct ipv6hdr *old_ipv6h = NULL; #endif + ip_vs_drop_early_demux_sk(skb); + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) @@ -924,8 +974,8 @@ int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct net *net = skb_net(skb); - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs = cp->ipvs; + struct net *net = ipvs->net; struct rtable *rt; /* Route to the other host */ __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ @@ -941,7 +991,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_CONNECT | @@ -999,7 +1049,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip_local_out(skb); + ip_local_out(net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); rcu_read_unlock(); @@ -1035,7 +1085,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, &saddr, ipvsh, 1, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -1090,7 +1141,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip6_local_out(skb); + ip6_local_out(cp->ipvs->net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); rcu_read_unlock(); @@ -1122,7 +1173,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); @@ -1161,10 +1212,12 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL); + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH); if (local < 0) goto tx_error; if (local) { @@ -1229,7 +1282,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, NULL, iph); if (local < 0) goto tx_error; @@ -1321,8 +1374,8 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, - NULL, ipvsh, 0, rt_mode); + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; rt = (struct rt6_info *) skb_dst(skb); @@ -1346,7 +1399,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI6\n", __func__, &cp->daddr.in6); diff --git a/kernel/net/netfilter/nf_conntrack_core.c b/kernel/net/netfilter/nf_conntrack_core.c index 13fad8668..3cb3cb831 100644 --- a/kernel/net/netfilter/nf_conntrack_core.c +++ b/kernel/net/netfilter/nf_conntrack_core.c @@ -126,7 +126,7 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); unsigned int nf_conntrack_hash_rnd __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); -static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) { unsigned int n; @@ -135,7 +135,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) * three bytes manually. */ n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^ + return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); } @@ -151,15 +151,15 @@ static u32 hash_bucket(u32 hash, const struct net *net) } static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - u16 zone, unsigned int size) + unsigned int size) { - return __hash_bucket(hash_conntrack_raw(tuple, zone), size); + return __hash_bucket(hash_conntrack_raw(tuple), size); } -static inline u_int32_t hash_conntrack(const struct net *net, u16 zone, +static inline u_int32_t hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, zone, net->ct.htable_size); + return __hash_conntrack(tuple, net->ct.htable_size); } bool @@ -168,6 +168,7 @@ nf_ct_get_tuple(const struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, + struct net *net, struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) @@ -181,12 +182,13 @@ nf_ct_get_tuple(const struct sk_buff *skb, tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; - return l4proto->pkt_to_tuple(skb, dataoff, tuple); + return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); } EXPORT_SYMBOL_GPL(nf_ct_get_tuple); bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, - u_int16_t l3num, struct nf_conntrack_tuple *tuple) + u_int16_t l3num, + struct net *net, struct nf_conntrack_tuple *tuple) { struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -205,7 +207,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, l4proto = __nf_ct_l4proto_find(l3num, protonum); - ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple, + ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, l3proto, l4proto); rcu_read_unlock(); @@ -287,6 +289,40 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) spin_unlock(&pcpu->lock); } +/* Released via destroy_conntrack() */ +struct nf_conn *nf_ct_tmpl_alloc(struct net *net, + const struct nf_conntrack_zone *zone, + gfp_t flags) +{ + struct nf_conn *tmpl; + + tmpl = kzalloc(sizeof(*tmpl), flags); + if (tmpl == NULL) + return NULL; + + tmpl->status = IPS_TEMPLATE; + write_pnet(&tmpl->ct_net, net); + + if (nf_ct_zone_add(tmpl, flags, zone) < 0) + goto out_free; + + atomic_set(&tmpl->ct_general.use, 0); + + return tmpl; +out_free: + kfree(tmpl); + return NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); + +void nf_ct_tmpl_free(struct nf_conn *tmpl) +{ + nf_ct_ext_destroy(tmpl); + nf_ct_ext_free(tmpl); + kfree(tmpl); +} +EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); + static void destroy_conntrack(struct nf_conntrack *nfct) { @@ -298,6 +334,10 @@ destroy_conntrack(struct nf_conntrack *nfct) NF_CT_ASSERT(atomic_read(&nfct->use) == 0); NF_CT_ASSERT(!timer_pending(&ct->timeout)); + if (unlikely(nf_ct_is_template(ct))) { + nf_ct_tmpl_free(ct); + return; + } rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto && l4proto->destroy) @@ -329,7 +369,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; - u16 zone = nf_ct_zone(ct); unsigned int sequence; nf_ct_helper_destroy(ct); @@ -337,9 +376,9 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) local_bh_disable(); do { sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_conntrack(net, zone, + hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -387,8 +426,8 @@ static void death_by_timeout(unsigned long ul_conntrack) static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, - const struct nf_conntrack_tuple *tuple, - u16 zone) + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -396,8 +435,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, * so we need to check that the conntrack is confirmed */ return nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone(ct) == zone && - nf_ct_is_confirmed(ct); + nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && + nf_ct_is_confirmed(ct); } /* @@ -406,7 +445,7 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, * and recheck nf_ct_tuple_equal(tuple, &h->tuple) */ static struct nf_conntrack_tuple_hash * -____nf_conntrack_find(struct net *net, u16 zone, +____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; @@ -442,7 +481,7 @@ begin: /* Find a connection corresponding to a tuple. */ static struct nf_conntrack_tuple_hash * -__nf_conntrack_find_get(struct net *net, u16 zone, +__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; @@ -469,11 +508,11 @@ begin: } struct nf_conntrack_tuple_hash * -nf_conntrack_find_get(struct net *net, u16 zone, +nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, zone)); + hash_conntrack_raw(tuple)); } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -492,11 +531,11 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, int nf_conntrack_hash_check_insert(struct nf_conn *ct) { + const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - u16 zone; unsigned int sequence; zone = nf_ct_zone(ct); @@ -504,9 +543,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) local_bh_disable(); do { sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_conntrack(net, zone, + hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -514,12 +553,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; add_timer(&ct->timeout); @@ -540,32 +581,11 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); -/* deletion from this larval template list happens via nf_ct_put() */ -void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) -{ - struct ct_pcpu *pcpu; - - __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); - nf_conntrack_get(&tmpl->ct_general); - - /* add this conntrack to the (per cpu) tmpl list */ - local_bh_disable(); - tmpl->cpu = smp_processor_id(); - pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu); - - spin_lock(&pcpu->lock); - /* Overload tuple linked list to put us in template list. */ - hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &pcpu->tmpl); - spin_unlock_bh(&pcpu->lock); -} -EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); - /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) { + const struct nf_conntrack_zone *zone; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -574,7 +594,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; - u16 zone; unsigned int sequence; ct = nf_ct_get(skb, &ctinfo); @@ -595,7 +614,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* reuse the hash saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = hash_bucket(hash, net); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -627,12 +646,14 @@ __nf_conntrack_confirm(struct sk_buff *skb) hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; /* Timer relative to confirmation time, not original @@ -685,11 +706,14 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { struct net *net = nf_ct_net(ignored_conntrack); + const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nf_conn *ct; - u16 zone = nf_ct_zone(ignored_conntrack); - unsigned int hash = hash_conntrack(net, zone, tuple); + unsigned int hash; + + zone = nf_ct_zone(ignored_conntrack); + hash = hash_conntrack(net, tuple); /* Disable BHs the entire time since we need to disable them at * least once for the stats anyway. @@ -699,7 +723,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone(ct) == zone) { + nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) { NF_CT_STAT_INC(net, found); rcu_read_unlock_bh(); return 1; @@ -788,7 +812,8 @@ void init_nf_conntrack_hash_rnd(void) } static struct nf_conn * -__nf_conntrack_alloc(struct net *net, u16 zone, +__nf_conntrack_alloc(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) @@ -798,7 +823,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone, if (unlikely(!nf_conntrack_hash_rnd)) { init_nf_conntrack_hash_rnd(); /* recompute the hash as nf_conntrack_hash_rnd is initialized */ - hash = hash_conntrack_raw(orig, zone); + hash = hash_conntrack_raw(orig); } /* We don't want any race condition at early drop stage */ @@ -818,10 +843,9 @@ __nf_conntrack_alloc(struct net *net, u16 zone, * SLAB_DESTROY_BY_RCU. */ ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); - if (ct == NULL) { - atomic_dec(&net->ct.count); - return ERR_PTR(-ENOMEM); - } + if (ct == NULL) + goto out; + spin_lock_init(&ct->lock); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; @@ -835,31 +859,24 @@ __nf_conntrack_alloc(struct net *net, u16 zone, memset(&ct->__nfct_init_offset[0], 0, offsetof(struct nf_conn, proto) - offsetof(struct nf_conn, __nfct_init_offset[0])); -#ifdef CONFIG_NF_CONNTRACK_ZONES - if (zone) { - struct nf_conntrack_zone *nf_ct_zone; - nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC); - if (!nf_ct_zone) - goto out_free; - nf_ct_zone->id = zone; - } -#endif + if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0) + goto out_free; + /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ atomic_set(&ct->ct_general.use, 0); return ct; - -#ifdef CONFIG_NF_CONNTRACK_ZONES out_free: - atomic_dec(&net->ct.count); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); +out: + atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); -#endif } -struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, +struct nf_conn *nf_conntrack_alloc(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp) @@ -901,8 +918,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_tuple repl_tuple; struct nf_conntrack_ecache *ecache; struct nf_conntrack_expect *exp = NULL; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; + struct nf_conntrack_zone tmp; unsigned int *timeouts; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { @@ -910,6 +928,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, return NULL; } + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) @@ -921,10 +940,13 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; - if (timeout_ext) - timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); - else + if (timeout_ext) { + timeouts = nf_ct_timeout_data(timeout_ext); + if (unlikely(!timeouts)) + timeouts = l4proto->get_timeouts(net); + } else { timeouts = l4proto->get_timeouts(net); + } if (!l4proto->new(ct, skb, dataoff, timeouts)) { nf_conntrack_free(ct); @@ -933,7 +955,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } if (timeout_ext) - nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC); + nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), + GFP_ATOMIC); nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); @@ -1004,21 +1027,23 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, int *set_reply, enum ip_conntrack_info *ctinfo) { + const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_zone tmp; struct nf_conn *ct; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), - dataoff, l3num, protonum, &tuple, l3proto, + dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { pr_debug("resolve_normal_ct: Can't get tuple\n"); return NULL; } /* look for tuple match */ - hash = hash_conntrack_raw(&tuple, zone); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); + hash = hash_conntrack_raw(&tuple); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, @@ -1522,10 +1547,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) sz = nr_slots * sizeof(struct hlist_nulls_head); hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); - if (!hash) { - printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); + if (!hash) hash = vzalloc(sz); - } if (hash && nulls) for (i = 0; i < nr_slots; i++) @@ -1576,8 +1599,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); - bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct), - hashsize); + bucket = __hash_conntrack(&h->tuple, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } @@ -1751,7 +1773,6 @@ int nf_conntrack_init_net(struct net *net) spin_lock_init(&pcpu->lock); INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL); } net->ct.stat = alloc_percpu(struct ip_conntrack_stat); diff --git a/kernel/net/netfilter/nf_conntrack_expect.c b/kernel/net/netfilter/nf_conntrack_expect.c index 7a17070c5..acf5c7b3f 100644 --- a/kernel/net/netfilter/nf_conntrack_expect.c +++ b/kernel/net/netfilter/nf_conntrack_expect.c @@ -88,7 +88,8 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple } struct nf_conntrack_expect * -__nf_ct_expect_find(struct net *net, u16 zone, +__nf_ct_expect_find(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; @@ -100,7 +101,7 @@ __nf_ct_expect_find(struct net *net, u16 zone, h = nf_ct_expect_dst_hash(tuple); hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone(i->master) == zone) + nf_ct_zone_equal_any(i->master, zone)) return i; } return NULL; @@ -109,7 +110,8 @@ EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. */ struct nf_conntrack_expect * -nf_ct_expect_find_get(struct net *net, u16 zone, +nf_ct_expect_find_get(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; @@ -127,7 +129,8 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * -nf_ct_find_expectation(struct net *net, u16 zone, +nf_ct_find_expectation(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i, *exp = NULL; @@ -140,7 +143,7 @@ nf_ct_find_expectation(struct net *net, u16 zone, hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone(i->master) == zone) { + nf_ct_zone_equal_any(i->master, zone)) { exp = i; break; } @@ -219,16 +222,17 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } - return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); + return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && + nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { return a->master == b->master && a->class == b->class && - nf_ct_tuple_equal(&a->tuple, &b->tuple) && - nf_ct_tuple_mask_equal(&a->mask, &b->mask) && - nf_ct_zone(a->master) == nf_ct_zone(b->master); + nf_ct_tuple_equal(&a->tuple, &b->tuple) && + nf_ct_tuple_mask_equal(&a->mask, &b->mask) && + nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } /* Generally a bad idea to call this: could have matched already. */ diff --git a/kernel/net/netfilter/nf_conntrack_h323_main.c b/kernel/net/netfilter/nf_conntrack_h323_main.c index 1d69f5b97..9511af04d 100644 --- a/kernel/net/netfilter/nf_conntrack_h323_main.c +++ b/kernel/net/netfilter/nf_conntrack_h323_main.c @@ -779,8 +779,8 @@ static int callforward_do_filter(struct net *net, flowi6_to_flowi(&fl1), false)) { if (!afinfo->route(net, (struct dst_entry **)&rt2, flowi6_to_flowi(&fl2), false)) { - if (ipv6_addr_equal(rt6_nexthop(rt1), - rt6_nexthop(rt2)) && + if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr), + rt6_nexthop(rt2, &fl2.daddr)) && rt1->dst.dev == rt2->dst.dev) ret = 1; dst_release(&rt2->dst); diff --git a/kernel/net/netfilter/nf_conntrack_labels.c b/kernel/net/netfilter/nf_conntrack_labels.c index bb53f120e..3ce5c314e 100644 --- a/kernel/net/netfilter/nf_conntrack_labels.c +++ b/kernel/net/netfilter/nf_conntrack_labels.c @@ -14,6 +14,8 @@ #include #include +static spinlock_t nf_connlabels_lock; + static unsigned int label_bits(const struct nf_conn_labels *l) { unsigned int longs = l->words; @@ -48,7 +50,6 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit) } EXPORT_SYMBOL_GPL(nf_connlabel_set); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) static void replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; @@ -89,7 +90,35 @@ int nf_connlabels_replace(struct nf_conn *ct, return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_replace); -#endif + +int nf_connlabels_get(struct net *net, unsigned int n_bits) +{ + size_t words; + + if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE)) + return -ERANGE; + + words = BITS_TO_LONGS(n_bits); + + spin_lock(&nf_connlabels_lock); + net->ct.labels_used++; + if (words > net->ct.label_words) + net->ct.label_words = words; + spin_unlock(&nf_connlabels_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabels_get); + +void nf_connlabels_put(struct net *net) +{ + spin_lock(&nf_connlabels_lock); + net->ct.labels_used--; + if (net->ct.labels_used == 0) + net->ct.label_words = 0; + spin_unlock(&nf_connlabels_lock); +} +EXPORT_SYMBOL_GPL(nf_connlabels_put); static struct nf_ct_ext_type labels_extend __read_mostly = { .len = sizeof(struct nf_conn_labels), @@ -99,6 +128,7 @@ static struct nf_ct_ext_type labels_extend __read_mostly = { int nf_conntrack_labels_init(void) { + spin_lock_init(&nf_connlabels_lock); return nf_ct_extend_register(&labels_extend); } diff --git a/kernel/net/netfilter/nf_conntrack_netlink.c b/kernel/net/netfilter/nf_conntrack_netlink.c index d1c23940a..9f5272968 100644 --- a/kernel/net/netfilter/nf_conntrack_netlink.c +++ b/kernel/net/netfilter/nf_conntrack_netlink.c @@ -127,6 +127,20 @@ ctnetlink_dump_tuples(struct sk_buff *skb, return ret; } +static inline int +ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, + const struct nf_conntrack_zone *zone, int dir) +{ + if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir) + return 0; + if (nla_put_be16(skb, attrtype, htons(zone->id))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + static inline int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { @@ -458,6 +472,7 @@ static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nf_conn *ct) { + const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nlattr *nest_parms; @@ -473,11 +488,16 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -485,10 +505,13 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - if (nf_ct_zone(ct) && - nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0 || @@ -598,7 +621,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES - + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) @@ -609,6 +632,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) static int ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) { + const struct nf_conntrack_zone *zone; struct net *net; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -655,11 +679,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) nfmsg->res_id = 0; rcu_read_lock(); + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -667,10 +696,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - if (nf_ct_zone(ct) && - nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) @@ -920,15 +952,54 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr, return ret; } +static int +ctnetlink_parse_zone(const struct nlattr *attr, + struct nf_conntrack_zone *zone) +{ + nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (attr) + zone->id = ntohs(nla_get_be16(attr)); +#else + if (attr) + return -EOPNOTSUPP; +#endif + return 0; +} + +static int +ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type, + struct nf_conntrack_zone *zone) +{ + int ret; + + if (zone->id != NF_CT_DEFAULT_ZONE_ID) + return -EINVAL; + + ret = ctnetlink_parse_zone(attr, zone); + if (ret < 0) + return ret; + + if (type == CTA_TUPLE_REPLY) + zone->dir = NF_CT_ZONE_DIR_REPL; + else + zone->dir = NF_CT_ZONE_DIR_ORIG; + + return 0; +} + static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_IP] = { .type = NLA_NESTED }, [CTA_TUPLE_PROTO] = { .type = NLA_NESTED }, + [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; static int ctnetlink_parse_tuple(const struct nlattr * const cda[], struct nf_conntrack_tuple *tuple, - enum ctattr_type type, u_int8_t l3num) + enum ctattr_type type, u_int8_t l3num, + struct nf_conntrack_zone *zone) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -955,6 +1026,16 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (err < 0) return err; + if (tb[CTA_TUPLE_ZONE]) { + if (!zone) + return -EINVAL; + + err = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE], + type, zone); + if (err < 0) + return err; + } + /* orig and expect tuples get DIR_ORIGINAL */ if (type == CTA_TUPLE_REPLY) tuple->dst.dir = IP_CT_DIR_REPLY; @@ -964,21 +1045,6 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], return 0; } -static int -ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone) -{ - if (attr) -#ifdef CONFIG_NF_CONNTRACK_ZONES - *zone = ntohs(nla_get_be16(attr)); -#else - return -EOPNOTSUPP; -#endif - else - *zone = 0; - - return 0; -} - static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, @@ -1058,7 +1124,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conn *ct; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1066,9 +1132,11 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, + u3, &zone); else if (cda[CTA_TUPLE_REPLY]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, + u3, &zone); else { return ctnetlink_flush_conntrack(net, cda, NETLINK_CB(skb).portid, @@ -1078,7 +1146,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - h = nf_conntrack_find_get(net, zone, &tuple); + h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; @@ -1112,7 +1180,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, struct sk_buff *skb2 = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { @@ -1138,16 +1206,18 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, + u3, &zone); else if (cda[CTA_TUPLE_REPLY]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, + u3, &zone); else return -EINVAL; if (err < 0) return err; - h = nf_conntrack_find_get(net, zone, &tuple); + h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; @@ -1645,7 +1715,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct, } static struct nf_conn * -ctnetlink_create_conntrack(struct net *net, u16 zone, +ctnetlink_create_conntrack(struct net *net, + const struct nf_conntrack_zone *zone, const struct nlattr * const cda[], struct nf_conntrack_tuple *otuple, struct nf_conntrack_tuple *rtuple, @@ -1761,7 +1832,8 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, struct nf_conntrack_tuple_hash *master_h; struct nf_conn *master_ct; - err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, + u3, NULL); if (err < 0) goto err2; @@ -1804,7 +1876,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1812,21 +1884,23 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) { - err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, + u3, &zone); if (err < 0) return err; } if (cda[CTA_TUPLE_REPLY]) { - err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, + u3, &zone); if (err < 0) return err; } if (cda[CTA_TUPLE_ORIG]) - h = nf_conntrack_find_get(net, zone, &otuple); + h = nf_conntrack_find_get(net, &zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = nf_conntrack_find_get(net, zone, &rtuple); + h = nf_conntrack_find_get(net, &zone, &rtuple); if (h == NULL) { err = -ENOENT; @@ -1836,7 +1910,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) return -EINVAL; - ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, + ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple, &rtuple, u3); if (IS_ERR(ct)) return PTR_ERR(ct); @@ -2059,9 +2133,9 @@ ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask); -#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT +#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT static size_t -ctnetlink_nfqueue_build_size(const struct nf_conn *ct) +ctnetlink_glue_build_size(const struct nf_conn *ct) { return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ @@ -2082,23 +2156,40 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES - + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) ; } -static int -ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) +static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb, + enum ip_conntrack_info *ctinfo) +{ + struct nf_conn *ct; + + ct = nf_ct_get(skb, ctinfo); + if (ct && nf_ct_is_untracked(ct)) + ct = NULL; + + return ct; +} + +static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { + const struct nf_conntrack_zone *zone; struct nlattr *nest_parms; rcu_read_lock(); + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -2106,12 +2197,14 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - if (nf_ct_zone(ct)) { - if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) - goto nla_put_failure; - } + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) + goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) goto nla_put_failure; @@ -2154,7 +2247,32 @@ nla_put_failure: } static int -ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) +ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + u_int16_t ct_attr, u_int16_t ct_info_attr) +{ + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, ct_attr | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (__ctnetlink_glue_build(skb, ct) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + + if (nla_put_be32(skb, ct_info_attr, htonl(ctinfo))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static int +ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) { int err; @@ -2194,7 +2312,7 @@ ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) } static int -ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) +ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct) { struct nlattr *cda[CTA_MAX+1]; int ret; @@ -2204,31 +2322,31 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) return ret; spin_lock_bh(&nf_conntrack_expect_lock); - ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); + ret = ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } -static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda, - const struct nf_conn *ct, - struct nf_conntrack_tuple *tuple, - struct nf_conntrack_tuple *mask) +static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda, + const struct nf_conn *ct, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask) { int err; err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, - nf_ct_l3num(ct)); + nf_ct_l3num(ct), NULL); if (err < 0) return err; return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, - nf_ct_l3num(ct)); + nf_ct_l3num(ct), NULL); } static int -ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, - u32 portid, u32 report) +ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, + u32 portid, u32 report) { struct nlattr *cda[CTA_EXPECT_MAX+1]; struct nf_conntrack_tuple tuple, mask; @@ -2240,8 +2358,8 @@ ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (err < 0) return err; - err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda, - ct, &tuple, &mask); + err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda, + ct, &tuple, &mask); if (err < 0) return err; @@ -2268,14 +2386,24 @@ ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, return 0; } -static struct nfq_ct_hook ctnetlink_nfqueue_hook = { - .build_size = ctnetlink_nfqueue_build_size, - .build = ctnetlink_nfqueue_build, - .parse = ctnetlink_nfqueue_parse, - .attach_expect = ctnetlink_nfqueue_attach_expect, - .seq_adjust = nf_ct_tcp_seqadj_set, +static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int diff) +{ + if (!(ct->status & IPS_NAT_MASK)) + return; + + nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff); +} + +static struct nfnl_ct_hook ctnetlink_glue_hook = { + .get_ct = ctnetlink_glue_get_ct, + .build_size = ctnetlink_glue_build_size, + .build = ctnetlink_glue_build, + .parse = ctnetlink_glue_parse, + .attach_expect = ctnetlink_glue_attach_expect, + .seq_adjust = ctnetlink_glue_seqadj, }; -#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */ +#endif /* CONFIG_NETFILTER_NETLINK_GLUE_CT */ /*********************************************************************** * EXPECT @@ -2612,23 +2740,22 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - u16 zone = 0; + struct nf_conntrack_zone zone; struct netlink_dump_control c = { .dump = ctnetlink_exp_ct_dump_table, .done = ctnetlink_exp_done, }; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, + u3, NULL); if (err < 0) return err; - if (cda[CTA_EXPECT_ZONE]) { - err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); - if (err < 0) - return err; - } + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; - h = nf_conntrack_find_get(net, zone, &tuple); + h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; @@ -2652,7 +2779,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, struct sk_buff *skb2; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { @@ -2672,16 +2799,18 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_EXPECT_TUPLE]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); else if (cda[CTA_EXPECT_MASTER]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, + u3, NULL); else return -EINVAL; if (err < 0) return err; - exp = nf_ct_expect_find_get(net, zone, &tuple); + exp = nf_ct_expect_find_get(net, &zone, &tuple); if (!exp) return -ENOENT; @@ -2732,8 +2861,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct hlist_node *next; u_int8_t u3 = nfmsg->nfgen_family; + struct nf_conntrack_zone zone; unsigned int i; - u16 zone; int err; if (cda[CTA_EXPECT_TUPLE]) { @@ -2742,12 +2871,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; /* bump usage count to 2 */ - exp = nf_ct_expect_find_get(net, zone, &tuple); + exp = nf_ct_expect_find_get(net, &zone, &tuple); if (!exp) return -ENOENT; @@ -2849,7 +2979,8 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, return -EINVAL; err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, - &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3); + &nat_tuple, CTA_EXPECT_NAT_TUPLE, + u3, NULL); if (err < 0) return err; @@ -2937,7 +3068,8 @@ err_out: } static int -ctnetlink_create_expect(struct net *net, u16 zone, +ctnetlink_create_expect(struct net *net, + const struct nf_conntrack_zone *zone, const struct nlattr * const cda[], u_int8_t u3, u32 portid, int report) { @@ -2949,13 +3081,16 @@ ctnetlink_create_expect(struct net *net, u16 zone, int err; /* caller guarantees that those three CTA_EXPECT_* exist */ - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, + u3, NULL); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, + u3, NULL); if (err < 0) return err; @@ -2995,11 +3130,6 @@ ctnetlink_create_expect(struct net *net, u16 zone, } err = nf_ct_expect_related_report(exp, portid, report); - if (err < 0) - goto err_exp; - - return 0; -err_exp: nf_ct_expect_put(exp); err_ct: nf_ct_put(ct); @@ -3016,7 +3146,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_expect *exp; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; if (!cda[CTA_EXPECT_TUPLE] @@ -3028,19 +3158,18 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; spin_lock_bh(&nf_conntrack_expect_lock); - exp = __nf_ct_expect_find(net, zone, &tuple); - + exp = __nf_ct_expect_find(net, &zone, &tuple); if (!exp) { spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { - err = ctnetlink_create_expect(net, zone, cda, - u3, + err = ctnetlink_create_expect(net, &zone, cda, u3, NETLINK_CB(skb).portid, nlmsg_report(nlh)); } @@ -3258,9 +3387,9 @@ static int __init ctnetlink_init(void) pr_err("ctnetlink_init: cannot register pernet operations\n"); goto err_unreg_exp_subsys; } -#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT +#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT /* setup interaction between nf_queue and nf_conntrack_netlink. */ - RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook); + RCU_INIT_POINTER(nfnl_ct_hook, &ctnetlink_glue_hook); #endif return 0; @@ -3279,8 +3408,8 @@ static void __exit ctnetlink_exit(void) unregister_pernet_subsys(&ctnetlink_net_ops); nfnetlink_subsys_unregister(&ctnl_exp_subsys); nfnetlink_subsys_unregister(&ctnl_subsys); -#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT - RCU_INIT_POINTER(nfq_ct_hook, NULL); +#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT + RCU_INIT_POINTER(nfnl_ct_hook, NULL); #endif } diff --git a/kernel/net/netfilter/nf_conntrack_pptp.c b/kernel/net/netfilter/nf_conntrack_pptp.c index 825c3e3f8..5588c7ae1 100644 --- a/kernel/net/netfilter/nf_conntrack_pptp.c +++ b/kernel/net/netfilter/nf_conntrack_pptp.c @@ -143,13 +143,14 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, const struct nf_conntrack_tuple *t) { const struct nf_conntrack_tuple_hash *h; + const struct nf_conntrack_zone *zone; struct nf_conntrack_expect *exp; struct nf_conn *sibling; - u16 zone = nf_ct_zone(ct); pr_debug("trying to timeout ct or exp for tuple "); nf_ct_dump_tuple(t); + zone = nf_ct_zone(ct); h = nf_conntrack_find_get(net, zone, t); if (h) { sibling = nf_ct_tuplehash_to_ctrack(h); diff --git a/kernel/net/netfilter/nf_conntrack_proto_dccp.c b/kernel/net/netfilter/nf_conntrack_proto_dccp.c index 6dd995c7c..fce1b1cca 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_dccp.c +++ b/kernel/net/netfilter/nf_conntrack_proto_dccp.c @@ -398,7 +398,7 @@ static inline struct dccp_net *dccp_pernet(struct net *net) } static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { struct dccp_hdr _hdr, *dh; diff --git a/kernel/net/netfilter/nf_conntrack_proto_generic.c b/kernel/net/netfilter/nf_conntrack_proto_generic.c index 60865f110..86dc752e5 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_generic.c +++ b/kernel/net/netfilter/nf_conntrack_proto_generic.c @@ -45,7 +45,7 @@ static inline struct nf_generic_net *generic_pernet(struct net *net) static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { tuple->src.u.all = 0; tuple->dst.u.all = 0; @@ -90,7 +90,13 @@ static int generic_packet(struct nf_conn *ct, static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { - return nf_generic_should_process(nf_ct_protonum(ct)); + bool ret; + + ret = nf_generic_should_process(nf_ct_protonum(ct)); + if (!ret) + pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n", + nf_ct_protonum(ct)); + return ret; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) diff --git a/kernel/net/netfilter/nf_conntrack_proto_gre.c b/kernel/net/netfilter/nf_conntrack_proto_gre.c index 7648674f2..a96451a7a 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_gre.c +++ b/kernel/net/netfilter/nf_conntrack_proto_gre.c @@ -190,9 +190,8 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, /* gre hdr info to tuple */ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { - struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev); const struct gre_hdr_pptp *pgrehdr; struct gre_hdr_pptp _pgrehdr; __be16 srckey; diff --git a/kernel/net/netfilter/nf_conntrack_proto_sctp.c b/kernel/net/netfilter/nf_conntrack_proto_sctp.c index b45da90fa..9578a7c37 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_sctp.c +++ b/kernel/net/netfilter/nf_conntrack_proto_sctp.c @@ -42,6 +42,8 @@ static const char *const sctp_conntrack_names[] = { "SHUTDOWN_SENT", "SHUTDOWN_RECD", "SHUTDOWN_ACK_SENT", + "HEARTBEAT_SENT", + "HEARTBEAT_ACKED", }; #define SECS * HZ @@ -57,6 +59,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, + [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, + [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, }; #define sNO SCTP_CONNTRACK_NONE @@ -67,6 +71,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT +#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED #define sIV SCTP_CONNTRACK_MAX /* @@ -88,6 +94,10 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. +HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. +HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to + that of the HEARTBEAT chunk. Secondary connection is + established. */ /* TODO @@ -97,36 +107,40 @@ CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - - Multi Homing support. + - Full Multi Homing support. */ /* SCTP conntrack state transitions */ -static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} } }; @@ -142,7 +156,7 @@ static inline struct sctp_net *sctp_pernet(struct net *net) } static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { const struct sctphdr *hp; struct sctphdr _hdr; @@ -278,9 +292,16 @@ static int sctp_new_state(enum ip_conntrack_dir dir, pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; + case SCTP_CID_HEARTBEAT: + pr_debug("SCTP_CID_HEARTBEAT"); + i = 9; + break; + case SCTP_CID_HEARTBEAT_ACK: + pr_debug("SCTP_CID_HEARTBEAT_ACK"); + i = 10; + break; default: - /* Other chunks like DATA, SACK, HEARTBEAT and - its ACK do not cause a change in state */ + /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type, Will stay in %s\n", sctp_conntrack_names[cur_state]); return cur_state; @@ -329,6 +350,8 @@ static int sctp_packet(struct nf_conn *ct, !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + !test_bit(SCTP_CID_HEARTBEAT, map) && + !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { pr_debug("Verification tag check failed\n"); goto out; @@ -357,6 +380,16 @@ static int sctp_packet(struct nf_conn *ct, /* Sec 8.5.1 (D) */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; + } else if (sch->type == SCTP_CID_HEARTBEAT || + sch->type == SCTP_CID_HEARTBEAT_ACK) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting vtag %x for dir %d\n", + sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out_unlock; + } } old_state = ct->proto.sctp.state; @@ -466,6 +499,10 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Sec 8.5.1 (A) */ return false; } + } else if (sch->type == SCTP_CID_HEARTBEAT) { + pr_debug("Setting vtag %x for secondary conntrack\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ @@ -610,6 +647,8 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ @@ -658,6 +697,18 @@ static struct ctl_table sctp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "nf_conntrack_sctp_timeout_heartbeat_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { } }; @@ -730,6 +781,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; + pn->ctl_table[7].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_SENT]; + pn->ctl_table[8].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_ACKED]; #endif return 0; } diff --git a/kernel/net/netfilter/nf_conntrack_proto_tcp.c b/kernel/net/netfilter/nf_conntrack_proto_tcp.c index 70383de72..278f3b935 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_tcp.c +++ b/kernel/net/netfilter/nf_conntrack_proto_tcp.c @@ -277,7 +277,7 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net) } static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { const struct tcphdr *hp; struct tcphdr _hdr; diff --git a/kernel/net/netfilter/nf_conntrack_proto_udp.c b/kernel/net/netfilter/nf_conntrack_proto_udp.c index 6957281ff..478f92f83 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_udp.c +++ b/kernel/net/netfilter/nf_conntrack_proto_udp.c @@ -38,6 +38,7 @@ static inline struct nf_udp_net *udp_pernet(struct net *net) static bool udp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) { const struct udphdr *hp; diff --git a/kernel/net/netfilter/nf_conntrack_proto_udplite.c b/kernel/net/netfilter/nf_conntrack_proto_udplite.c index c5903d164..1ac8ee13a 100644 --- a/kernel/net/netfilter/nf_conntrack_proto_udplite.c +++ b/kernel/net/netfilter/nf_conntrack_proto_udplite.c @@ -48,6 +48,7 @@ static inline struct udplite_net *udplite_pernet(struct net *net) static bool udplite_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) { const struct udphdr *hp; diff --git a/kernel/net/netfilter/nf_conntrack_seqadj.c b/kernel/net/netfilter/nf_conntrack_seqadj.c index ce3e840c8..dff0f0cc5 100644 --- a/kernel/net/netfilter/nf_conntrack_seqadj.c +++ b/kernel/net/netfilter/nf_conntrack_seqadj.c @@ -103,9 +103,9 @@ static void nf_ct_sack_block_adjust(struct sk_buff *skb, ntohl(sack->end_seq), ntohl(new_end_seq)); inet_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); + sack->start_seq, new_start_seq, false); inet_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); + sack->end_seq, new_end_seq, false); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); @@ -193,8 +193,9 @@ int nf_ct_seq_adjust(struct sk_buff *skb, newseq = htonl(ntohl(tcph->seq) + seqoff); newack = htonl(ntohl(tcph->ack_seq) - ackoff); - inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); - inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, + false); pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), diff --git a/kernel/net/netfilter/nf_conntrack_standalone.c b/kernel/net/netfilter/nf_conntrack_standalone.c index fc823fa5d..1fb3cacc0 100644 --- a/kernel/net/netfilter/nf_conntrack_standalone.c +++ b/kernel/net/netfilter/nf_conntrack_standalone.c @@ -140,6 +140,35 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) } #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES +static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, + int dir) +{ + const struct nf_conntrack_zone *zone = nf_ct_zone(ct); + + if (zone->dir != dir) + return; + switch (zone->dir) { + case NF_CT_DEFAULT_ZONE_DIR: + seq_printf(s, "zone=%u ", zone->id); + break; + case NF_CT_ZONE_DIR_ORIG: + seq_printf(s, "zone-orig=%u ", zone->id); + break; + case NF_CT_ZONE_DIR_REPL: + seq_printf(s, "zone-reply=%u ", zone->id); + break; + default: + break; + } +} +#else +static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, + int dir) +{ +} +#endif + #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { @@ -202,6 +231,8 @@ static int ct_seq_show(struct seq_file *s, void *v) print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, l3proto, l4proto); + ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); + if (seq_has_overflowed(s)) goto release; @@ -214,6 +245,8 @@ static int ct_seq_show(struct seq_file *s, void *v) print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l3proto, l4proto); + ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; @@ -228,11 +261,7 @@ static int ct_seq_show(struct seq_file *s, void *v) #endif ct_show_secctx(s, ct); - -#ifdef CONFIG_NF_CONNTRACK_ZONES - seq_printf(s, "zone=%u ", nf_ct_zone(ct)); -#endif - + ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); diff --git a/kernel/net/netfilter/nf_internals.h b/kernel/net/netfilter/nf_internals.h index ea7f36784..065522564 100644 --- a/kernel/net/netfilter/nf_internals.h +++ b/kernel/net/netfilter/nf_internals.h @@ -19,6 +19,7 @@ unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, /* nf_queue.c */ int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, struct nf_hook_state *state, unsigned int queuenum); +void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops); int __init netfilter_queue_init(void); /* nf_log.c */ diff --git a/kernel/net/netfilter/nf_log.c b/kernel/net/netfilter/nf_log.c index 675d12c69..a5d41dfa9 100644 --- a/kernel/net/netfilter/nf_log.c +++ b/kernel/net/netfilter/nf_log.c @@ -107,12 +107,17 @@ EXPORT_SYMBOL(nf_log_register); void nf_log_unregister(struct nf_logger *logger) { + const struct nf_logger *log; int i; mutex_lock(&nf_log_mutex); - for (i = 0; i < NFPROTO_NUMPROTO; i++) - RCU_INIT_POINTER(loggers[i][logger->type], NULL); + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + log = nft_log_dereference(loggers[i][logger->type]); + if (log == logger) + RCU_INIT_POINTER(loggers[i][logger->type], NULL); + } mutex_unlock(&nf_log_mutex); + synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unregister); diff --git a/kernel/net/netfilter/nf_nat_core.c b/kernel/net/netfilter/nf_nat_core.c index 4e0b47831..06a9f4577 100644 --- a/kernel/net/netfilter/nf_nat_core.c +++ b/kernel/net/netfilter/nf_nat_core.c @@ -83,7 +83,7 @@ out: rcu_read_unlock(); } -int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) +int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) { struct flowi fl; unsigned int hh_len; @@ -99,7 +99,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); + dst = xfrm_lookup(net, dst, &fl, skb->sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -118,14 +118,13 @@ EXPORT_SYMBOL(nf_xfrm_me_harder); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct net *net, u16 zone, - const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) { unsigned int hash; /* Original src, to ensure we map it consistently if poss. */ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd); + tuple->dst.protonum ^ nf_conntrack_hash_rnd); return reciprocal_scale(hash, net->ct.nat_htable_size); } @@ -185,20 +184,22 @@ same_src(const struct nf_conn *ct, /* Only called for SRC manip */ static int -find_appropriate_src(struct net *net, u16 zone, +find_appropriate_src(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_nat_l3proto *l3proto, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(net, zone, tuple); + unsigned int h = hash_by_src(net, tuple); const struct nf_conn_nat *nat; const struct nf_conn *ct; hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { ct = nat->ct; - if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { + if (same_src(ct, tuple) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -218,7 +219,8 @@ find_appropriate_src(struct net *net, u16 zone, * the ip with the lowest src-ip/dst-ip/proto usage. */ static void -find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, +find_best_ips_proto(const struct nf_conntrack_zone *zone, + struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) @@ -258,7 +260,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, */ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), range->flags & NF_NAT_RANGE_PERSISTENT ? - 0 : (__force u32)tuple->dst.u3.all[max] ^ zone); + 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); full_range = false; for (i = 0; i <= max; i++) { @@ -297,10 +299,12 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { + const struct nf_conntrack_zone *zone; const struct nf_nat_l3proto *l3proto; const struct nf_nat_l4proto *l4proto; struct net *net = nf_ct_net(ct); - u16 zone = nf_ct_zone(ct); + + zone = nf_ct_zone(ct); rcu_read_lock(); l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num); @@ -420,7 +424,7 @@ nf_nat_setup_info(struct nf_conn *ct, if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; - srchash = hash_by_src(net, nf_ct_zone(ct), + srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate extension aera */ diff --git a/kernel/net/netfilter/nf_nat_proto_dccp.c b/kernel/net/netfilter/nf_nat_proto_dccp.c index b8067b53f..15c47b246 100644 --- a/kernel/net/netfilter/nf_nat_proto_dccp.c +++ b/kernel/net/netfilter/nf_nat_proto_dccp.c @@ -69,7 +69,7 @@ dccp_manip_pkt(struct sk_buff *skb, l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, - 0); + false); return true; } diff --git a/kernel/net/netfilter/nf_nat_proto_tcp.c b/kernel/net/netfilter/nf_nat_proto_tcp.c index 37f5505f4..4f8820fc5 100644 --- a/kernel/net/netfilter/nf_nat_proto_tcp.c +++ b/kernel/net/netfilter/nf_nat_proto_tcp.c @@ -70,7 +70,7 @@ tcp_manip_pkt(struct sk_buff *skb, return true; l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); - inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); + inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); return true; } diff --git a/kernel/net/netfilter/nf_nat_proto_udp.c b/kernel/net/netfilter/nf_nat_proto_udp.c index b0ede2f0d..b1e627227 100644 --- a/kernel/net/netfilter/nf_nat_proto_udp.c +++ b/kernel/net/netfilter/nf_nat_proto_udp.c @@ -57,7 +57,7 @@ udp_manip_pkt(struct sk_buff *skb, l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, - 0); + false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } diff --git a/kernel/net/netfilter/nf_nat_proto_udplite.c b/kernel/net/netfilter/nf_nat_proto_udplite.c index 368f14e01..58340c97b 100644 --- a/kernel/net/netfilter/nf_nat_proto_udplite.c +++ b/kernel/net/netfilter/nf_nat_proto_udplite.c @@ -56,7 +56,7 @@ udplite_manip_pkt(struct sk_buff *skb, } l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); - inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; diff --git a/kernel/net/netfilter/nf_nat_redirect.c b/kernel/net/netfilter/nf_nat_redirect.c index 97b75f9bf..d43869879 100644 --- a/kernel/net/netfilter/nf_nat_redirect.c +++ b/kernel/net/netfilter/nf_nat_redirect.c @@ -55,7 +55,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, rcu_read_lock(); indev = __in_dev_get_rcu(skb->dev); - if (indev != NULL) { + if (indev && indev->ifa_list) { ifa = indev->ifa_list; newdst = ifa->ifa_local; } diff --git a/kernel/net/netfilter/nf_queue.c b/kernel/net/netfilter/nf_queue.c index 2e88032cd..5baa8e24e 100644 --- a/kernel/net/netfilter/nf_queue.c +++ b/kernel/net/netfilter/nf_queue.c @@ -69,19 +69,14 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) dev_put(physdev); } #endif - /* Drop reference to owner of hook which queued us. */ - module_put(entry->elem->owner); } EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); /* Bump dev refs so they don't vanish while packet is out */ -bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) +void nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; - if (!try_module_get(entry->elem->owner)) - return false; - if (state->in) dev_hold(state->in); if (state->out) @@ -100,11 +95,20 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) dev_hold(physdev); } #endif - - return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); +void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) +{ + const struct nf_queue_handler *qh; + + rcu_read_lock(); + qh = rcu_dereference(queue_handler); + if (qh) + qh->nf_hook_drop(net, ops); + rcu_read_unlock(); +} + /* * Any packet that leaves via this function must come back * through nf_reinject(). @@ -120,22 +124,20 @@ int nf_queue(struct sk_buff *skb, const struct nf_queue_handler *qh; /* QUEUE == DROP if no one is waiting, to be safe. */ - rcu_read_lock(); - qh = rcu_dereference(queue_handler); if (!qh) { status = -ESRCH; - goto err_unlock; + goto err; } afinfo = nf_get_afinfo(state->pf); if (!afinfo) - goto err_unlock; + goto err; entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); if (!entry) { status = -ENOMEM; - goto err_unlock; + goto err; } *entry = (struct nf_queue_entry) { @@ -145,16 +147,11 @@ int nf_queue(struct sk_buff *skb, .size = sizeof(*entry) + afinfo->route_key_size, }; - if (!nf_queue_entry_get_refs(entry)) { - status = -ECANCELED; - goto err_unlock; - } + nf_queue_entry_get_refs(entry); skb_dst_force(skb); afinfo->saveroute(skb, entry); status = qh->outfn(entry, queuenum); - rcu_read_unlock(); - if (status < 0) { nf_queue_entry_release_refs(entry); goto err; @@ -162,8 +159,6 @@ int nf_queue(struct sk_buff *skb, return 0; -err_unlock: - rcu_read_unlock(); err: kfree(entry); return status; @@ -176,19 +171,15 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) const struct nf_afinfo *afinfo; int err; - rcu_read_lock(); - nf_queue_entry_release_refs(entry); /* Continue traversal iff userspace said ok... */ - if (verdict == NF_REPEAT) { - elem = list_entry(elem->list.prev, struct nf_hook_ops, list); - verdict = NF_ACCEPT; - } + if (verdict == NF_REPEAT) + verdict = elem->hook(elem->priv, skb, &entry->state); if (verdict == NF_ACCEPT) { afinfo = nf_get_afinfo(entry->state.pf); - if (!afinfo || afinfo->reroute(skb, entry) < 0) + if (!afinfo || afinfo->reroute(entry->state.net, skb, entry) < 0) verdict = NF_DROP; } @@ -196,7 +187,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(&nf_hooks[entry->state.pf][entry->state.hook], + verdict = nf_iterate(entry->state.hook_list, skb, &entry->state, &elem); } @@ -204,15 +195,13 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) case NF_ACCEPT: case NF_STOP: local_bh_disable(); - entry->state.okfn(entry->state.sk, skb); + entry->state.okfn(entry->state.net, entry->state.sk, skb); local_bh_enable(); break; case NF_QUEUE: err = nf_queue(skb, elem, &entry->state, verdict >> NF_VERDICT_QBITS); if (err < 0) { - if (err == -ECANCELED) - goto next_hook; if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) goto next_hook; @@ -224,7 +213,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) default: kfree_skb(skb); } - rcu_read_unlock(); + kfree(entry); } EXPORT_SYMBOL(nf_reinject); diff --git a/kernel/net/netfilter/nf_synproxy_core.c b/kernel/net/netfilter/nf_synproxy_core.c index 52e20c9a4..c8a4a48bc 100644 --- a/kernel/net/netfilter/nf_synproxy_core.c +++ b/kernel/net/netfilter/nf_synproxy_core.c @@ -11,15 +11,18 @@ #include #include #include +#include #include #include #include #include + #include #include #include #include +#include int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); @@ -185,7 +188,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, const struct nf_conn_synproxy *synproxy) { unsigned int optoff, optend; - u32 *ptr, old; + __be32 *ptr, old; if (synproxy->tsoff == 0) return 1; @@ -213,18 +216,18 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, if (op[0] == TCPOPT_TIMESTAMP && op[1] == TCPOLEN_TIMESTAMP) { if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - ptr = (u32 *)&op[2]; + ptr = (__be32 *)&op[2]; old = *ptr; *ptr = htonl(ntohl(*ptr) - synproxy->tsoff); } else { - ptr = (u32 *)&op[6]; + ptr = (__be32 *)&op[6]; old = *ptr; *ptr = htonl(ntohl(*ptr) + synproxy->tsoff); } inet_proto_csum_replace4(&th->check, skb, - old, *ptr, 0); + old, *ptr, false); return 1; } optoff += op[1]; @@ -348,23 +351,20 @@ static void __net_exit synproxy_proc_exit(struct net *net) static int __net_init synproxy_net_init(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); - struct nf_conntrack_tuple t; struct nf_conn *ct; int err = -ENOMEM; - memset(&t, 0, sizeof(t)); - ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL); - if (IS_ERR(ct)) { - err = PTR_ERR(ct); + ct = nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL); + if (!ct) goto err1; - } if (!nfct_seqadj_ext_add(ct)) goto err2; if (!nfct_synproxy_ext_add(ct)) goto err2; - nf_conntrack_tmpl_insert(net, ct); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_get(&ct->ct_general); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); @@ -380,7 +380,7 @@ static int __net_init synproxy_net_init(struct net *net) err3: free_percpu(snet->stats); err2: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err1: return err; } diff --git a/kernel/net/netfilter/nf_tables_api.c b/kernel/net/netfilter/nf_tables_api.c index 34ded0931..2cb429d34 100644 --- a/kernel/net/netfilter/nf_tables_api.c +++ b/kernel/net/netfilter/nf_tables_api.c @@ -89,6 +89,7 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) } static void nft_ctx_init(struct nft_ctx *ctx, + struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, struct nft_af_info *afi, @@ -96,7 +97,7 @@ static void nft_ctx_init(struct nft_ctx *ctx, struct nft_chain *chain, const struct nlattr * const *nla) { - ctx->net = sock_net(skb->sk); + ctx->net = net; ctx->afi = afi; ctx->table = table; ctx->chain = chain; @@ -127,13 +128,50 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } +int nft_register_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) +{ + struct net *net = read_pnet(&basechain->pnet); + + if (basechain->flags & NFT_BASECHAIN_DISABLED) + return 0; + + return nf_register_net_hooks(net, basechain->ops, hook_nops); +} +EXPORT_SYMBOL_GPL(nft_register_basechain); + +void nft_unregister_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) +{ + struct net *net = read_pnet(&basechain->pnet); + + if (basechain->flags & NFT_BASECHAIN_DISABLED) + return; + + nf_unregister_net_hooks(net, basechain->ops, hook_nops); +} +EXPORT_SYMBOL_GPL(nft_unregister_basechain); + +static int nf_tables_register_hooks(const struct nft_table *table, + struct nft_chain *chain, + unsigned int hook_nops) +{ + if (table->flags & NFT_TABLE_F_DORMANT || + !(chain->flags & NFT_BASE_CHAIN)) + return 0; + + return nft_register_basechain(nft_base_chain(chain), hook_nops); +} + static void nf_tables_unregister_hooks(const struct nft_table *table, - const struct nft_chain *chain, + struct nft_chain *chain, unsigned int hook_nops) { - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) - nf_unregister_hooks(nft_base_chain(chain)->ops, hook_nops); + if (table->flags & NFT_TABLE_F_DORMANT || + !(chain->flags & NFT_BASE_CHAIN)) + return; + + nft_unregister_basechain(nft_base_chain(chain), hook_nops); } /* Internal table flags */ @@ -560,7 +598,7 @@ static int nf_tables_table_enable(const struct nft_af_info *afi, if (!(chain->flags & NFT_BASE_CHAIN)) continue; - err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); + err = nft_register_basechain(nft_base_chain(chain), afi->nops); if (err < 0) goto err; @@ -575,20 +613,20 @@ err: if (i-- <= 0) break; - nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); + nft_unregister_basechain(nft_base_chain(chain), afi->nops); } return err; } static void nf_tables_table_disable(const struct nft_af_info *afi, - struct nft_table *table) + struct nft_table *table) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { if (chain->flags & NFT_BASE_CHAIN) - nf_unregister_hooks(nft_base_chain(chain)->ops, - afi->nops); + nft_unregister_basechain(nft_base_chain(chain), + afi->nops); } } @@ -635,15 +673,14 @@ err: return ret; } -static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nlattr *name; struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; u32 flags = 0; struct nft_ctx ctx; @@ -669,7 +706,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); return nf_tables_updtable(&ctx); } @@ -679,30 +716,32 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, return -EINVAL; } + err = -EAFNOSUPPORT; if (!try_module_get(afi->owner)) - return -EAFNOSUPPORT; + goto err1; err = -ENOMEM; table = kzalloc(sizeof(*table), GFP_KERNEL); if (table == NULL) - goto err1; + goto err2; nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); table->flags = flags; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) - goto err2; + goto err3; list_add_tail_rcu(&table->list, &afi->tables); return 0; -err2: +err3: kfree(table); -err1: +err2: module_put(afi->owner); +err1: return err; } @@ -771,18 +810,17 @@ out: return err; } -static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_deltable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; - nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla); if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL) return nft_flush(&ctx, family); @@ -881,6 +919,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 }, [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 }, + [NFTA_HOOK_DEV] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, }; static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) @@ -954,6 +994,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) goto nla_put_failure; + if (basechain->dev_name[0] && + nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name)) + goto nla_put_failure; nla_nest_end(skb, nest); if (nla_put_be32(skb, NFTA_CHAIN_POLICY, @@ -1165,16 +1208,20 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) BUG_ON(chain->use > 0); if (chain->flags & NFT_BASE_CHAIN) { - module_put(nft_base_chain(chain)->type->owner); - free_percpu(nft_base_chain(chain)->stats); - kfree(nft_base_chain(chain)); + struct nft_base_chain *basechain = nft_base_chain(chain); + + module_put(basechain->type->owner); + free_percpu(basechain->stats); + if (basechain->ops[0].dev != NULL) + dev_put(basechain->ops[0].dev); + kfree(basechain); } else { kfree(chain); } } -static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -1184,8 +1231,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, struct nft_chain *chain; struct nft_base_chain *basechain = NULL; struct nlattr *ha[NFTA_HOOK_MAX + 1]; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct net_device *dev = NULL; u8 policy = NF_ACCEPT; u64 handle = 0; unsigned int i; @@ -1264,7 +1311,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(stats); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, sizeof(struct nft_trans_chain)); if (trans == NULL) { @@ -1325,17 +1372,43 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return -ENOENT; hookfn = type->hooks[hooknum]; + if (afi->flags & NFT_AF_NEEDS_DEV) { + char ifname[IFNAMSIZ]; + + if (!ha[NFTA_HOOK_DEV]) { + module_put(type->owner); + return -EOPNOTSUPP; + } + + nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); + dev = dev_get_by_name(net, ifname); + if (!dev) { + module_put(type->owner); + return -ENOENT; + } + } else if (ha[NFTA_HOOK_DEV]) { + module_put(type->owner); + return -EOPNOTSUPP; + } + basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); if (basechain == NULL) { module_put(type->owner); + if (dev != NULL) + dev_put(dev); return -ENOMEM; } + if (dev != NULL) + strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); if (IS_ERR(stats)) { module_put(type->owner); kfree(basechain); + if (dev != NULL) + dev_put(dev); return PTR_ERR(stats); } basechain->stats = stats; @@ -1344,6 +1417,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (stats == NULL) { module_put(type->owner); kfree(basechain); + if (dev != NULL) + dev_put(dev); return -ENOMEM; } rcu_assign_pointer(basechain->stats, stats); @@ -1356,11 +1431,11 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, for (i = 0; i < afi->nops; i++) { ops = &basechain->ops[i]; ops->pf = family; - ops->owner = afi->owner; ops->hooknum = hooknum; ops->priority = priority; ops->priv = chain; ops->hook = afi->hooks[ops->hooknum]; + ops->dev = dev; if (hookfn) ops->hook = hookfn; if (afi->hook_ops_init) @@ -1380,14 +1455,11 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, chain->table = table; nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) { - err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); - if (err < 0) - goto err1; - } + err = nf_tables_register_hooks(table, chain, afi->nops); + if (err < 0) + goto err1; - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); if (err < 0) goto err2; @@ -1402,15 +1474,14 @@ err1: return err; } -static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; @@ -1432,7 +1503,7 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, if (chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); return nft_delchain(&ctx); } @@ -1936,13 +2007,12 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, static struct nft_expr_info *info; -static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; @@ -2001,7 +2071,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(old_rule); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); n = 0; size = 0; @@ -2102,13 +2172,12 @@ err1: return err; } -static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain = NULL; struct nft_rule *rule; @@ -2131,7 +2200,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(chain); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { @@ -2270,12 +2339,11 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, +static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi = NULL; struct nft_table *table = NULL; @@ -2297,7 +2365,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, return -ENOENT; } - nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); return 0; } @@ -2549,6 +2617,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_ctx ctx; struct sk_buff *skb2; @@ -2556,7 +2625,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -2619,14 +2688,13 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, return 0; } -static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nft_set_ops *ops; struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; @@ -2724,7 +2792,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(table)) return PTR_ERR(table); - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) { @@ -2808,8 +2876,8 @@ static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set nft_set_destroy(set); } -static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -2822,7 +2890,7 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -2950,7 +3018,7 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, +static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2959,7 +3027,6 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); if (IS_ERR(afi)) @@ -2971,7 +3038,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, if (!trans && (table->flags & NFT_TABLE_INACTIVE)) return -ENOENT; - nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); return 0; } @@ -3061,6 +3128,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_set_dump_args args; struct nft_ctx ctx; @@ -3076,8 +3144,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) return err; - err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, - false); + err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, + (void *)nla, false); if (err < 0) return err; @@ -3138,11 +3206,12 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false); if (err < 0) return err; @@ -3454,11 +3523,10 @@ err1: return err; } -static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3467,7 +3535,7 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, true); if (err < 0) return err; @@ -3549,8 +3617,8 @@ err1: return err; } -static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nlattr *attr; @@ -3561,7 +3629,7 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false); if (err < 0) return err; @@ -3956,7 +4024,8 @@ static int nf_tables_abort(struct sk_buff *skb) struct nft_trans *trans, *next; struct nft_trans_elem *te; - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, + list) { switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { diff --git a/kernel/net/netfilter/nf_tables_core.c b/kernel/net/netfilter/nf_tables_core.c index f153b0707..f3695a497 100644 --- a/kernel/net/netfilter/nf_tables_core.c +++ b/kernel/net/netfilter/nf_tables_core.c @@ -48,9 +48,7 @@ static void __nft_trace_packet(const struct nft_pktinfo *pkt, const struct nft_chain *chain, int rulenum, enum nft_trace type) { - struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - - nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, + nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in, pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", chain->table->name, chain->name, comments[type], rulenum); @@ -111,10 +109,10 @@ struct nft_jumpstack { }; unsigned int -nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) +nft_do_chain(struct nft_pktinfo *pkt, void *priv) { - const struct nft_chain *chain = ops->priv, *basechain = chain; - const struct net *net = read_pnet(&nft_base_chain(basechain)->pnet); + const struct nft_chain *chain = priv, *basechain = chain; + const struct net *net = pkt->net; const struct nft_rule *rule; const struct nft_expr *expr, *last; struct nft_regs regs; diff --git a/kernel/net/netfilter/nf_tables_netdev.c b/kernel/net/netfilter/nf_tables_netdev.c new file mode 100644 index 000000000..edb3502f2 --- /dev/null +++ b/kernel/net/netfilter/nf_tables_netdev.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void +nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct iphdr *iph, _iph; + u32 len, thoff; + + nft_set_pktinfo(pkt, skb, state); + + iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), + &_iph); + if (!iph) + return; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return; + + len = ntohs(iph->tot_len); + thoff = iph->ihl * 4; + if (skb->len < len) + return; + else if (len < thoff) + return; + + pkt->tprot = iph->protocol; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; +} + +static inline void +__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *ip6h, _ip6h; + unsigned int thoff = 0; + unsigned short frag_off; + int protohdr; + u32 pkt_len; + + ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), + &_ip6h); + if (!ip6h) + return; + + if (ip6h->version != 6) + return; + + pkt_len = ntohs(ip6h->payload_len); + if (pkt_len + sizeof(*ip6h) > skb->len) + return; + + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + if (protohdr < 0) + return; + + pkt->tprot = protohdr; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = frag_off; +#endif +} + +static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + nft_set_pktinfo(pkt, skb, state); + __nft_netdev_set_pktinfo_ipv6(pkt, skb, state); +} + +static unsigned int +nft_do_chain_netdev(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + switch (skb->protocol) { + case htons(ETH_P_IP): + nft_netdev_set_pktinfo_ipv4(&pkt, skb, state); + break; + case htons(ETH_P_IPV6): + nft_netdev_set_pktinfo_ipv6(&pkt, skb, state); + break; + default: + nft_set_pktinfo(&pkt, skb, state); + break; + } + + return nft_do_chain(&pkt, priv); +} + +static struct nft_af_info nft_af_netdev __read_mostly = { + .family = NFPROTO_NETDEV, + .nhooks = NF_NETDEV_NUMHOOKS, + .owner = THIS_MODULE, + .flags = NFT_AF_NEEDS_DEV, + .nops = 1, + .hooks = { + [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + }, +}; + +static int nf_tables_netdev_init_net(struct net *net) +{ + net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.netdev == NULL) + return -ENOMEM; + + memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev)); + + if (nft_register_afinfo(net, net->nft.netdev) < 0) + goto err; + + return 0; +err: + kfree(net->nft.netdev); + return -ENOMEM; +} + +static void nf_tables_netdev_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.netdev); + kfree(net->nft.netdev); +} + +static struct pernet_operations nf_tables_netdev_net_ops = { + .init = nf_tables_netdev_init_net, + .exit = nf_tables_netdev_exit_net, +}; + +static const struct nf_chain_type nft_filter_chain_netdev = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_NETDEV, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_NETDEV_INGRESS), +}; + +static void nft_netdev_event(unsigned long event, struct nft_af_info *afi, + struct net_device *dev, struct nft_table *table, + struct nft_base_chain *basechain) +{ + switch (event) { + case NETDEV_REGISTER: + if (strcmp(basechain->dev_name, dev->name) != 0) + return; + + BUG_ON(!(basechain->flags & NFT_BASECHAIN_DISABLED)); + + dev_hold(dev); + basechain->ops[0].dev = dev; + basechain->flags &= ~NFT_BASECHAIN_DISABLED; + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nft_register_basechain(basechain, afi->nops); + break; + case NETDEV_UNREGISTER: + if (strcmp(basechain->dev_name, dev->name) != 0) + return; + + BUG_ON(basechain->flags & NFT_BASECHAIN_DISABLED); + + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nft_unregister_basechain(basechain, afi->nops); + + dev_put(basechain->ops[0].dev); + basechain->ops[0].dev = NULL; + basechain->flags |= NFT_BASECHAIN_DISABLED; + break; + case NETDEV_CHANGENAME: + if (dev->ifindex != basechain->ops[0].dev->ifindex) + return; + + strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + break; + } +} + +static int nf_tables_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nft_af_info *afi; + struct nft_table *table; + struct nft_chain *chain; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) { + if (afi->family != NFPROTO_NETDEV) + continue; + + list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry(chain, &table->chains, list) { + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + nft_netdev_event(event, afi, dev, table, + nft_base_chain(chain)); + } + } + } + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + + return NOTIFY_DONE; +} + +static struct notifier_block nf_tables_netdev_notifier = { + .notifier_call = nf_tables_netdev_event, +}; + +static int __init nf_tables_netdev_init(void) +{ + int ret; + + nft_register_chain_type(&nft_filter_chain_netdev); + ret = register_pernet_subsys(&nf_tables_netdev_net_ops); + if (ret < 0) + nft_unregister_chain_type(&nft_filter_chain_netdev); + + register_netdevice_notifier(&nf_tables_netdev_notifier); + + return ret; +} + +static void __exit nf_tables_netdev_exit(void) +{ + unregister_netdevice_notifier(&nf_tables_netdev_notifier); + unregister_pernet_subsys(&nf_tables_netdev_net_ops); + nft_unregister_chain_type(&nft_filter_chain_netdev); +} + +module_init(nf_tables_netdev_init); +module_exit(nf_tables_netdev_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_FAMILY(5); /* NFPROTO_NETDEV */ diff --git a/kernel/net/netfilter/nfnetlink.c b/kernel/net/netfilter/nfnetlink.c index 8b117c90e..77afe913d 100644 --- a/kernel/net/netfilter/nfnetlink.c +++ b/kernel/net/netfilter/nfnetlink.c @@ -64,7 +64,7 @@ void nfnl_unlock(__u8 subsys_id) EXPORT_SYMBOL_GPL(nfnl_unlock); #ifdef CONFIG_PROVE_LOCKING -int lockdep_nfnl_is_held(u8 subsys_id) +bool lockdep_nfnl_is_held(u8 subsys_id) { return lockdep_is_held(&table[subsys_id].mutex); } @@ -269,6 +269,12 @@ static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) } } +enum { + NFNL_BATCH_FAILURE = (1 << 0), + NFNL_BATCH_DONE = (1 << 1), + NFNL_BATCH_REPLAY = (1 << 2), +}; + static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, u_int16_t subsys_id) { @@ -276,19 +282,19 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; - bool success = true, done = false; static LIST_HEAD(err_list); + u32 status; int err; if (subsys_id >= NFNL_SUBSYS_COUNT) return netlink_ack(skb, nlh, -EINVAL); replay: + status = 0; + skb = netlink_skb_clone(oskb, GFP_KERNEL); if (!skb) return netlink_ack(oskb, nlh, -ENOMEM); - skb->sk = oskb->sk; - nfnl_lock(subsys_id); ss = rcu_dereference_protected(table[subsys_id].subsys, lockdep_is_held(&table[subsys_id].mutex)); @@ -336,10 +342,10 @@ replay: if (type == NFNL_MSG_BATCH_BEGIN) { /* Malformed: Batch begin twice */ nfnl_err_reset(&err_list); - success = false; + status |= NFNL_BATCH_FAILURE; goto done; } else if (type == NFNL_MSG_BATCH_END) { - done = true; + status |= NFNL_BATCH_DONE; goto done; } else if (type < NLMSG_MIN_TYPE) { err = -EINVAL; @@ -373,7 +379,7 @@ replay: goto ack; if (nc->call_batch) { - err = nc->call_batch(net->nfnl, skb, nlh, + err = nc->call_batch(net, net->nfnl, skb, nlh, (const struct nlattr **)cda); } @@ -382,11 +388,8 @@ replay: * original skb. */ if (err == -EAGAIN) { - nfnl_err_reset(&err_list); - ss->abort(oskb); - nfnl_unlock(subsys_id); - kfree_skb(skb); - goto replay; + status |= NFNL_BATCH_REPLAY; + goto next; } } ack: @@ -402,7 +405,7 @@ ack: */ nfnl_err_reset(&err_list); netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM); - success = false; + status |= NFNL_BATCH_FAILURE; goto done; } /* We don't stop processing the batch on errors, thus, @@ -410,19 +413,26 @@ ack: * triggers. */ if (err) - success = false; + status |= NFNL_BATCH_FAILURE; } - +next: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } done: - if (success && done) + if (status & NFNL_BATCH_REPLAY) { + ss->abort(oskb); + nfnl_err_reset(&err_list); + nfnl_unlock(subsys_id); + kfree_skb(skb); + goto replay; + } else if (status == NFNL_BATCH_DONE) { ss->commit(oskb); - else + } else { ss->abort(oskb); + } nfnl_err_deliver(&err_list, oskb); nfnl_unlock(subsys_id); @@ -432,6 +442,7 @@ done: static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); + u_int16_t res_id; int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || @@ -456,7 +467,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) nfgenmsg = nlmsg_data(nlh); skb_pull(skb, msglen); - nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id); + /* Work around old nft using host byte order */ + if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) + res_id = NFNL_SUBSYS_NFTABLES; + else + res_id = ntohs(nfgenmsg->res_id); + nfnetlink_rcv_batch(skb, nlh, res_id); } else { netlink_rcv_skb(skb, &nfnetlink_rcv_msg); } @@ -474,7 +490,7 @@ static int nfnetlink_bind(struct net *net, int group) type = nfnl_group2type[group]; rcu_read_lock(); - ss = nfnetlink_get_subsys(type); + ss = nfnetlink_get_subsys(type << 8); rcu_read_unlock(); if (!ss) request_module("nfnetlink-subsys-%d", type); diff --git a/kernel/net/netfilter/nfnetlink_acct.c b/kernel/net/netfilter/nfnetlink_acct.c index c18af2f63..fefbf5f0b 100644 --- a/kernel/net/netfilter/nfnetlink_acct.c +++ b/kernel/net/netfilter/nfnetlink_acct.c @@ -27,8 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure"); -static LIST_HEAD(nfnl_acct_list); - struct nf_acct { atomic64_t pkts; atomic64_t bytes; @@ -53,6 +51,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { struct nf_acct *nfacct, *matching = NULL; + struct net *net = sock_net(nfnl); char *acct_name; unsigned int size = 0; u32 flags = 0; @@ -64,7 +63,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, if (strlen(acct_name) == 0) return -EINVAL; - list_for_each_entry(nfacct, &nfnl_acct_list, head) { + list_for_each_entry(nfacct, &net->nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -124,7 +123,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } atomic_set(&nfacct->refcnt, 1); - list_add_tail_rcu(&nfacct->head, &nfnl_acct_list); + list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list); return 0; } @@ -185,6 +184,7 @@ nla_put_failure: static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct nf_acct *cur, *last; const struct nfacct_filter *filter = cb->data; @@ -196,7 +196,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[1] = 0; rcu_read_lock(); - list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { if (last) { if (cur != last) continue; @@ -257,6 +257,7 @@ static int nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { + struct net *net = sock_net(nfnl); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; @@ -283,7 +284,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &nfnl_acct_list, head) { + list_for_each_entry(cur, &net->nfnl_acct_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) @@ -336,19 +337,20 @@ static int nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { + struct net *net = sock_net(nfnl); char *acct_name; struct nf_acct *cur; int ret = -ENOENT; if (!tb[NFACCT_NAME]) { - list_for_each_entry(cur, &nfnl_acct_list, head) + list_for_each_entry(cur, &net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; } acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &nfnl_acct_list, head) { + list_for_each_entry(cur, &net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -394,12 +396,12 @@ static const struct nfnetlink_subsystem nfnl_acct_subsys = { MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); -struct nf_acct *nfnl_acct_find_get(const char *acct_name) +struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name) { struct nf_acct *cur, *acct = NULL; rcu_read_lock(); - list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; @@ -422,7 +424,9 @@ EXPORT_SYMBOL_GPL(nfnl_acct_find_get); void nfnl_acct_put(struct nf_acct *acct) { - atomic_dec(&acct->refcnt); + if (atomic_dec_and_test(&acct->refcnt)) + kfree_rcu(acct, rcu_head); + module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(nfnl_acct_put); @@ -478,34 +482,59 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) } EXPORT_SYMBOL_GPL(nfnl_acct_overquota); +static int __net_init nfnl_acct_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->nfnl_acct_list); + + return 0; +} + +static void __net_exit nfnl_acct_net_exit(struct net *net) +{ + struct nf_acct *cur, *tmp; + + list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) { + list_del_rcu(&cur->head); + + if (atomic_dec_and_test(&cur->refcnt)) + kfree_rcu(cur, rcu_head); + } +} + +static struct pernet_operations nfnl_acct_ops = { + .init = nfnl_acct_net_init, + .exit = nfnl_acct_net_exit, +}; + static int __init nfnl_acct_init(void) { int ret; + ret = register_pernet_subsys(&nfnl_acct_ops); + if (ret < 0) { + pr_err("nfnl_acct_init: failed to register pernet ops\n"); + goto err_out; + } + pr_info("nfnl_acct: registering with nfnetlink.\n"); ret = nfnetlink_subsys_register(&nfnl_acct_subsys); if (ret < 0) { pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); - goto err_out; + goto cleanup_pernet; } return 0; + +cleanup_pernet: + unregister_pernet_subsys(&nfnl_acct_ops); err_out: return ret; } static void __exit nfnl_acct_exit(void) { - struct nf_acct *cur, *tmp; - pr_info("nfnl_acct: unregistering from nfnetlink.\n"); nfnetlink_subsys_unregister(&nfnl_acct_subsys); - - list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) { - list_del_rcu(&cur->head); - /* We are sure that our objects have no clients at this point, - * it's safe to release them all without checking refcnt. */ - kfree_rcu(cur, rcu_head); - } + unregister_pernet_subsys(&nfnl_acct_ops); } module_init(nfnl_acct_init); diff --git a/kernel/net/netfilter/nfnetlink_cttimeout.c b/kernel/net/netfilter/nfnetlink_cttimeout.c index 476accd17..c7a2d0e1c 100644 --- a/kernel/net/netfilter/nfnetlink_cttimeout.c +++ b/kernel/net/netfilter/nfnetlink_cttimeout.c @@ -291,6 +291,34 @@ cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb, return ret; } +static void untimeout(struct nf_conntrack_tuple_hash *i, + struct ctnl_timeout *timeout) +{ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); + struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); + + if (timeout_ext && (!timeout || timeout_ext->timeout == timeout)) + RCU_INIT_POINTER(timeout_ext->timeout, NULL); +} + +static void ctnl_untimeout(struct ctnl_timeout *timeout) +{ + struct nf_conntrack_tuple_hash *h; + const struct hlist_nulls_node *nn; + int i; + + local_bh_disable(); + for (i = 0; i < init_net.ct.htable_size; i++) { + spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + if (i < init_net.ct.htable_size) { + hlist_nulls_for_each_entry(h, nn, &init_net.ct.hash[i], hnnode) + untimeout(h, timeout); + } + spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + } + local_bh_enable(); +} + /* try to delete object, fail if it is still in use. */ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) { @@ -301,6 +329,7 @@ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); nf_ct_l4proto_put(timeout->l4proto); + ctnl_untimeout(timeout); kfree_rcu(timeout, rcu_head); } else { /* still in use, restore reference counter. */ @@ -567,6 +596,10 @@ static void __exit cttimeout_exit(void) pr_info("cttimeout: unregistering from nfnetlink.\n"); nfnetlink_subsys_unregister(&cttimeout_subsys); + + /* Make sure no conntrack objects refer to custom timeouts anymore. */ + ctnl_untimeout(NULL); + list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) { list_del_rcu(&cur->head); /* We are sure that our objects have no clients at this point, @@ -579,6 +612,7 @@ static void __exit cttimeout_exit(void) RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ + rcu_barrier(); } module_init(cttimeout_init); diff --git a/kernel/net/netfilter/nfnetlink_log.c b/kernel/net/netfilter/nfnetlink_log.c index 4ef1fae84..740cce468 100644 --- a/kernel/net/netfilter/nfnetlink_log.c +++ b/kernel/net/netfilter/nfnetlink_log.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -401,7 +402,9 @@ __build_packet_message(struct nfnl_log_net *log, unsigned int hooknum, const struct net_device *indev, const struct net_device *outdev, - const char *prefix, unsigned int plen) + const char *prefix, unsigned int plen, + const struct nfnl_ct_hook *nfnl_ct, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; @@ -538,9 +541,9 @@ __build_packet_message(struct nfnl_log_net *log, if (skb->tstamp.tv64) { struct nfulnl_msg_packet_timestamp ts; - struct timeval tv = ktime_to_timeval(skb->tstamp); - ts.sec = cpu_to_be64(tv.tv_sec); - ts.usec = cpu_to_be64(tv.tv_usec); + struct timespec64 kts = ktime_to_timespec64(skb->tstamp); + ts.sec = cpu_to_be64(kts.tv_sec); + ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; @@ -575,6 +578,10 @@ __build_packet_message(struct nfnl_log_net *log, htonl(atomic_inc_return(&log->global_seq)))) goto nla_put_failure; + if (ct && nfnl_ct->build(inst->skb, ct, ctinfo, + NFULA_CT, NFULA_CT_INFO) < 0) + goto nla_put_failure; + if (data_len) { struct nlattr *nla; int size = nla_attr_size(data_len); @@ -598,8 +605,6 @@ nla_put_failure: return -1; } -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_ULOG, .u = { @@ -622,12 +627,16 @@ nfulnl_log_packet(struct net *net, const struct nf_loginfo *li_user, const char *prefix) { - unsigned int size, data_len; + size_t size; + unsigned int data_len; struct nfulnl_instance *inst; const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen; struct nfnl_log_net *log = nfnl_log_pernet(net); + const struct nfnl_ct_hook *nfnl_ct = NULL; + struct nf_conn *ct = NULL; + enum ip_conntrack_info uninitialized_var(ctinfo); if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; @@ -673,6 +682,14 @@ nfulnl_log_packet(struct net *net, size += nla_total_size(sizeof(u_int32_t)); if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) size += nla_total_size(sizeof(u_int32_t)); + if (inst->flags & NFULNL_CFG_F_CONNTRACK) { + nfnl_ct = rcu_dereference(nfnl_ct_hook); + if (nfnl_ct != NULL) { + ct = nfnl_ct->get_ct(skb, &ctinfo); + if (ct != NULL) + size += nfnl_ct->build_size(ct); + } + } qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ @@ -717,7 +734,8 @@ nfulnl_log_packet(struct net *net, inst->qlen++; __build_packet_message(log, inst, skb, data_len, pf, - hooknum, in, out, prefix, plen); + hooknum, in, out, prefix, plen, + nfnl_ct, ct, ctinfo); if (inst->qlen >= qthreshold) __nfulnl_flush(inst); @@ -807,6 +825,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, struct net *net = sock_net(ctnl); struct nfnl_log_net *log = nfnl_log_pernet(net); int ret = 0; + u16 flags = 0; if (nfula[NFULA_CFG_CMD]) { u_int8_t pf = nfmsg->nfgen_family; @@ -828,6 +847,28 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out_put; } + /* Check if we support these flags in first place, dependencies should + * be there too not to break atomicity. + */ + if (nfula[NFULA_CFG_FLAGS]) { + flags = ntohs(nla_get_be16(nfula[NFULA_CFG_FLAGS])); + + if ((flags & NFULNL_CFG_F_CONNTRACK) && + !rcu_access_pointer(nfnl_ct_hook)) { +#ifdef CONFIG_MODULES + nfnl_unlock(NFNL_SUBSYS_ULOG); + request_module("ip_conntrack_netlink"); + nfnl_lock(NFNL_SUBSYS_ULOG); + if (rcu_access_pointer(nfnl_ct_hook)) { + ret = -EAGAIN; + goto out_put; + } +#endif + ret = -EOPNOTSUPP; + goto out_put; + } + } + if (cmd != NULL) { switch (cmd->command) { case NFULNL_CFG_CMD_BIND: @@ -856,16 +897,15 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ret = -ENOTSUPP; break; } + } else if (!inst) { + ret = -ENODEV; + goto out; } if (nfula[NFULA_CFG_MODE]) { - struct nfulnl_msg_config_mode *params; - params = nla_data(nfula[NFULA_CFG_MODE]); + struct nfulnl_msg_config_mode *params = + nla_data(nfula[NFULA_CFG_MODE]); - if (!inst) { - ret = -ENODEV; - goto out; - } nfulnl_set_mode(inst, params->copy_mode, ntohl(params->copy_range)); } @@ -873,42 +913,23 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, if (nfula[NFULA_CFG_TIMEOUT]) { __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]); - if (!inst) { - ret = -ENODEV; - goto out; - } nfulnl_set_timeout(inst, ntohl(timeout)); } if (nfula[NFULA_CFG_NLBUFSIZ]) { __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]); - if (!inst) { - ret = -ENODEV; - goto out; - } nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); } if (nfula[NFULA_CFG_QTHRESH]) { __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]); - if (!inst) { - ret = -ENODEV; - goto out; - } nfulnl_set_qthresh(inst, ntohl(qthresh)); } - if (nfula[NFULA_CFG_FLAGS]) { - __be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]); - - if (!inst) { - ret = -ENODEV; - goto out; - } - nfulnl_set_flags(inst, ntohs(flags)); - } + if (nfula[NFULA_CFG_FLAGS]) + nfulnl_set_flags(inst, flags); out_put: instance_put(inst); diff --git a/kernel/net/netfilter/nfnetlink_queue.c b/kernel/net/netfilter/nfnetlink_queue.c new file mode 100644 index 000000000..861c66152 --- /dev/null +++ b/kernel/net/netfilter/nfnetlink_queue.c @@ -0,0 +1,1444 @@ +/* + * This is a module which is used for queueing packets and communicating with + * userspace via nfnetlink. + * + * (C) 2005 by Harald Welte + * (C) 2007 by Patrick McHardy + * + * Based on the old ipv4-only ip_queue.c: + * (C) 2000-2002 James Morris + * (C) 2003-2005 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) +#include "../bridge/br_private.h" +#endif + +#define NFQNL_QMAX_DEFAULT 1024 + +/* We're using struct nlattr which has 16bit nla_len. Note that nla_len + * includes the header length. Thus, the maximum packet length that we + * support is 65531 bytes. We send truncated packets if the specified length + * is larger than that. Userspace can check for presence of NFQA_CAP_LEN + * attribute to detect truncation. + */ +#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) + +struct nfqnl_instance { + struct hlist_node hlist; /* global list of queues */ + struct rcu_head rcu; + + u32 peer_portid; + unsigned int queue_maxlen; + unsigned int copy_range; + unsigned int queue_dropped; + unsigned int queue_user_dropped; + + + u_int16_t queue_num; /* number of this queue */ + u_int8_t copy_mode; + u_int32_t flags; /* Set using NFQA_CFG_FLAGS */ +/* + * Following fields are dirtied for each queued packet, + * keep them in same cache line if possible. + */ + spinlock_t lock; + unsigned int queue_total; + unsigned int id_sequence; /* 'sequence' of pkt ids */ + struct list_head queue_list; /* packets in queue */ +}; + +typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); + +static int nfnl_queue_net_id __read_mostly; + +#define INSTANCE_BUCKETS 16 +struct nfnl_queue_net { + spinlock_t instances_lock; + struct hlist_head instance_table[INSTANCE_BUCKETS]; +}; + +static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) +{ + return net_generic(net, nfnl_queue_net_id); +} + +static inline u_int8_t instance_hashfn(u_int16_t queue_num) +{ + return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; +} + +static struct nfqnl_instance * +instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) +{ + struct hlist_head *head; + struct nfqnl_instance *inst; + + head = &q->instance_table[instance_hashfn(queue_num)]; + hlist_for_each_entry_rcu(inst, head, hlist) { + if (inst->queue_num == queue_num) + return inst; + } + return NULL; +} + +static struct nfqnl_instance * +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) +{ + struct nfqnl_instance *inst; + unsigned int h; + int err; + + spin_lock(&q->instances_lock); + if (instance_lookup(q, queue_num)) { + err = -EEXIST; + goto out_unlock; + } + + inst = kzalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) { + err = -ENOMEM; + goto out_unlock; + } + + inst->queue_num = queue_num; + inst->peer_portid = portid; + inst->queue_maxlen = NFQNL_QMAX_DEFAULT; + inst->copy_range = NFQNL_MAX_COPY_RANGE; + inst->copy_mode = NFQNL_COPY_NONE; + spin_lock_init(&inst->lock); + INIT_LIST_HEAD(&inst->queue_list); + + if (!try_module_get(THIS_MODULE)) { + err = -EAGAIN; + goto out_free; + } + + h = instance_hashfn(queue_num); + hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); + + spin_unlock(&q->instances_lock); + + return inst; + +out_free: + kfree(inst); +out_unlock: + spin_unlock(&q->instances_lock); + return ERR_PTR(err); +} + +static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, + unsigned long data); + +static void +instance_destroy_rcu(struct rcu_head *head) +{ + struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, + rcu); + + nfqnl_flush(inst, NULL, 0); + kfree(inst); + module_put(THIS_MODULE); +} + +static void +__instance_destroy(struct nfqnl_instance *inst) +{ + hlist_del_rcu(&inst->hlist); + call_rcu(&inst->rcu, instance_destroy_rcu); +} + +static void +instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) +{ + spin_lock(&q->instances_lock); + __instance_destroy(inst); + spin_unlock(&q->instances_lock); +} + +static inline void +__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ + list_add_tail(&entry->list, &queue->queue_list); + queue->queue_total++; +} + +static void +__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ + list_del(&entry->list); + queue->queue_total--; +} + +static struct nf_queue_entry * +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) +{ + struct nf_queue_entry *entry = NULL, *i; + + spin_lock_bh(&queue->lock); + + list_for_each_entry(i, &queue->queue_list, list) { + if (i->id == id) { + entry = i; + break; + } + } + + if (entry) + __dequeue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + + return entry; +} + +static void +nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + spin_lock_bh(&queue->lock); + list_for_each_entry_safe(entry, next, &queue->queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue->queue_total--; + nf_reinject(entry, NF_DROP); + } + } + spin_unlock_bh(&queue->lock); +} + +static int +nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, + bool csum_verify) +{ + __u32 flags = 0; + + if (packet->ip_summed == CHECKSUM_PARTIAL) + flags = NFQA_SKB_CSUMNOTREADY; + else if (csum_verify) + flags = NFQA_SKB_CSUM_NOTVERIFIED; + + if (skb_is_gso(packet)) + flags |= NFQA_SKB_GSO; + + return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; +} + +static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) +{ + const struct cred *cred; + + if (!sk_fullsock(sk)) + return 0; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + cred = sk->sk_socket->file->f_cred; + if (nla_put_be32(skb, NFQA_UID, + htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) + goto nla_put_failure; + if (nla_put_be32(skb, NFQA_GID, + htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) + goto nla_put_failure; + } + read_unlock_bh(&sk->sk_callback_lock); + return 0; + +nla_put_failure: + read_unlock_bh(&sk->sk_callback_lock); + return -1; +} + +static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) +{ + u32 seclen = 0; +#if IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (!skb || !sk_fullsock(skb->sk)) + return 0; + + read_lock_bh(&skb->sk->sk_callback_lock); + + if (skb->secmark) + security_secid_to_secctx(skb->secmark, secdata, &seclen); + + read_unlock_bh(&skb->sk->sk_callback_lock); +#endif + return seclen; +} + +static struct sk_buff * +nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + struct nf_queue_entry *entry, + __be32 **packet_id_ptr) +{ + size_t size; + size_t data_len = 0, cap_len = 0, rem_len = 0; + unsigned int hlen = 0; + struct sk_buff *skb; + struct nlattr *nla; + struct nfqnl_msg_packet_hdr *pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct sk_buff *entskb = entry->skb; + struct net_device *indev; + struct net_device *outdev; + struct nf_conn *ct = NULL; + enum ip_conntrack_info uninitialized_var(ctinfo); + struct nfnl_ct_hook *nfnl_ct; + bool csum_verify; + char *secdata = NULL; + u32 seclen = 0; + + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#endif + + nla_total_size(sizeof(u_int32_t)) /* mark */ + + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ + + nla_total_size(sizeof(u_int32_t)); /* cap_len */ + + if (entskb->tstamp.tv64) + size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + + if (entry->state.hook <= NF_INET_FORWARD || + (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) + csum_verify = !skb_csum_unnecessary(entskb); + else + csum_verify = false; + + outdev = entry->state.out; + + switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { + case NFQNL_COPY_META: + case NFQNL_COPY_NONE: + break; + + case NFQNL_COPY_PACKET: + if (!(queue->flags & NFQA_CFG_F_GSO) && + entskb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(entskb)) + return NULL; + + data_len = ACCESS_ONCE(queue->copy_range); + if (data_len > entskb->len) + data_len = entskb->len; + + hlen = skb_zerocopy_headlen(entskb); + hlen = min_t(unsigned int, hlen, data_len); + size += sizeof(struct nlattr) + hlen; + cap_len = entskb->len; + rem_len = data_len - hlen; + break; + } + + nfnl_ct = rcu_dereference(nfnl_ct_hook); + + if (queue->flags & NFQA_CFG_F_CONNTRACK) { + if (nfnl_ct != NULL) { + ct = nfnl_ct->get_ct(entskb, &ctinfo); + if (ct != NULL) + size += nfnl_ct->build_size(ct); + } + } + + if (queue->flags & NFQA_CFG_F_UID_GID) { + size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + + nla_total_size(sizeof(u_int32_t))); /* gid */ + } + + if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { + seclen = nfqnl_get_sk_secctx(entskb, &secdata); + if (seclen) + size += nla_total_size(seclen); + } + + skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid, + GFP_ATOMIC); + if (!skb) { + skb_tx_error(entskb); + return NULL; + } + + nlh = nlmsg_put(skb, 0, 0, + NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, + sizeof(struct nfgenmsg), 0); + if (!nlh) { + skb_tx_error(entskb); + kfree_skb(skb); + return NULL; + } + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = entry->state.pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(queue->queue_num); + + nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); + pmsg = nla_data(nla); + pmsg->hw_protocol = entskb->protocol; + pmsg->hook = entry->state.hook; + *packet_id_ptr = &pmsg->packet_id; + + indev = entry->state.in; + if (indev) { +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) + goto nla_put_failure; +#else + if (entry->state.pf == PF_BRIDGE) { + /* Case 1: indev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)) || + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ + nla_put_be32(skb, NFQA_IFINDEX_INDEV, + htonl(br_port_get_rcu(indev)->br->dev->ifindex))) + goto nla_put_failure; + } else { + int physinif; + + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, + htonl(indev->ifindex))) + goto nla_put_failure; + + physinif = nf_bridge_get_physinif(entskb); + if (physinif && + nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(physinif))) + goto nla_put_failure; + } +#endif + } + + if (outdev) { +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) + goto nla_put_failure; +#else + if (entry->state.pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)) || + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ + nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, + htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) + goto nla_put_failure; + } else { + int physoutif; + + /* Case 2: outdev is bridge group, we need to look for + * physical output device (when called from ipv4) */ + if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, + htonl(outdev->ifindex))) + goto nla_put_failure; + + physoutif = nf_bridge_get_physoutif(entskb); + if (physoutif && + nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(physoutif))) + goto nla_put_failure; + } +#endif + } + + if (entskb->mark && + nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) + goto nla_put_failure; + + if (indev && entskb->dev && + entskb->mac_header != entskb->network_header) { + struct nfqnl_msg_packet_hw phw; + int len; + + memset(&phw, 0, sizeof(phw)); + len = dev_parse_header(entskb, phw.hw_addr); + if (len) { + phw.hw_addrlen = htons(len); + if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) + goto nla_put_failure; + } + } + + if (entskb->tstamp.tv64) { + struct nfqnl_msg_packet_timestamp ts; + struct timespec64 kts = ktime_to_timespec64(skb->tstamp); + + ts.sec = cpu_to_be64(kts.tv_sec); + ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); + + if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) + goto nla_put_failure; + } + + if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && + nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) + goto nla_put_failure; + + if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) + goto nla_put_failure; + + if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) + goto nla_put_failure; + + if (cap_len > data_len && + nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) + goto nla_put_failure; + + if (nfqnl_put_packet_info(skb, entskb, csum_verify)) + goto nla_put_failure; + + if (data_len) { + struct nlattr *nla; + + if (skb_tailroom(skb) < sizeof(*nla) + hlen) + goto nla_put_failure; + + nla = (struct nlattr *)skb_put(skb, sizeof(*nla)); + nla->nla_type = NFQA_PAYLOAD; + nla->nla_len = nla_attr_size(data_len); + + if (skb_zerocopy(skb, entskb, data_len, hlen)) + goto nla_put_failure; + } + + nlh->nlmsg_len = skb->len; + return skb; + +nla_put_failure: + skb_tx_error(entskb); + kfree_skb(skb); + net_err_ratelimited("nf_queue: error creating packet message\n"); + return NULL; +} + +static int +__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, + struct nf_queue_entry *entry) +{ + struct sk_buff *nskb; + int err = -ENOBUFS; + __be32 *packet_id_ptr; + int failopen = 0; + + nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); + if (nskb == NULL) { + err = -ENOMEM; + goto err_out; + } + spin_lock_bh(&queue->lock); + + if (queue->queue_total >= queue->queue_maxlen) { + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { + failopen = 1; + err = 0; + } else { + queue->queue_dropped++; + net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", + queue->queue_total); + } + goto err_out_free_nskb; + } + entry->id = ++queue->id_sequence; + *packet_id_ptr = htonl(entry->id); + + /* nfnetlink_unicast will either free the nskb or add it to a socket */ + err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); + if (err < 0) { + queue->queue_user_dropped++; + goto err_out_unlock; + } + + __enqueue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + return 0; + +err_out_free_nskb: + kfree_skb(nskb); +err_out_unlock: + spin_unlock_bh(&queue->lock); + if (failopen) + nf_reinject(entry, NF_ACCEPT); +err_out: + return err; +} + +static struct nf_queue_entry * +nf_queue_entry_dup(struct nf_queue_entry *e) +{ + struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); + if (entry) + nf_queue_entry_get_refs(entry); + return entry; +} + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) +/* When called from bridge netfilter, skb->data must point to MAC header + * before calling skb_gso_segment(). Else, original MAC header is lost + * and segmented skbs will be sent to wrong destination. + */ +static void nf_bridge_adjust_skb_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_push(skb, skb->network_header - skb->mac_header); +} + +static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_pull(skb, skb->network_header - skb->mac_header); +} +#else +#define nf_bridge_adjust_skb_data(s) do {} while (0) +#define nf_bridge_adjust_segmented_data(s) do {} while (0) +#endif + +static void free_entry(struct nf_queue_entry *entry) +{ + nf_queue_entry_release_refs(entry); + kfree(entry); +} + +static int +__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, + struct sk_buff *skb, struct nf_queue_entry *entry) +{ + int ret = -ENOMEM; + struct nf_queue_entry *entry_seg; + + nf_bridge_adjust_segmented_data(skb); + + if (skb->next == NULL) { /* last packet, no need to copy entry */ + struct sk_buff *gso_skb = entry->skb; + entry->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry); + if (ret) + entry->skb = gso_skb; + return ret; + } + + skb->next = NULL; + + entry_seg = nf_queue_entry_dup(entry); + if (entry_seg) { + entry_seg->skb = skb; + ret = __nfqnl_enqueue_packet(net, queue, entry_seg); + if (ret) + free_entry(entry_seg); + } + return ret; +} + +static int +nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + unsigned int queued; + struct nfqnl_instance *queue; + struct sk_buff *skb, *segs; + int err = -ENOBUFS; + struct net *net = entry->state.net; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + /* rcu_read_lock()ed by nf_hook_slow() */ + queue = instance_lookup(q, queuenum); + if (!queue) + return -ESRCH; + + if (queue->copy_mode == NFQNL_COPY_NONE) + return -EINVAL; + + skb = entry->skb; + + switch (entry->state.pf) { + case NFPROTO_IPV4: + skb->protocol = htons(ETH_P_IP); + break; + case NFPROTO_IPV6: + skb->protocol = htons(ETH_P_IPV6); + break; + } + + if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) + return __nfqnl_enqueue_packet(net, queue, entry); + + nf_bridge_adjust_skb_data(skb); + segs = skb_gso_segment(skb, 0); + /* Does not use PTR_ERR to limit the number of error codes that can be + * returned by nf_queue. For instance, callers rely on -ESRCH to + * mean 'ignore this hook'. + */ + if (IS_ERR_OR_NULL(segs)) + goto out_err; + queued = 0; + err = 0; + do { + struct sk_buff *nskb = segs->next; + if (err == 0) + err = __nfqnl_enqueue_packet_gso(net, queue, + segs, entry); + if (err == 0) + queued++; + else + kfree_skb(segs); + segs = nskb; + } while (segs); + + if (queued) { + if (err) /* some segments are already queued */ + free_entry(entry); + kfree_skb(skb); + return 0; + } + out_err: + nf_bridge_adjust_segmented_data(skb); + return err; +} + +static int +nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) +{ + struct sk_buff *nskb; + + if (diff < 0) { + if (pskb_trim(e->skb, data_len)) + return -ENOMEM; + } else if (diff > 0) { + if (data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + diff, GFP_ATOMIC); + if (!nskb) { + printk(KERN_WARNING "nf_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + kfree_skb(e->skb); + e->skb = nskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(e->skb, data_len)) + return -ENOMEM; + skb_copy_to_linear_data(e->skb, data, data_len); + e->skb->ip_summed = CHECKSUM_NONE; + return 0; +} + +static int +nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status = 0; + + spin_lock_bh(&queue->lock); + switch (mode) { + case NFQNL_COPY_NONE: + case NFQNL_COPY_META: + queue->copy_mode = mode; + queue->copy_range = 0; + break; + + case NFQNL_COPY_PACKET: + queue->copy_mode = mode; + if (range == 0 || range > NFQNL_MAX_COPY_RANGE) + queue->copy_range = NFQNL_MAX_COPY_RANGE; + else + queue->copy_range = range; + break; + + default: + status = -EINVAL; + + } + spin_unlock_bh(&queue->lock); + + return status; +} + +static int +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) +{ + if (entry->state.in) + if (entry->state.in->ifindex == ifindex) + return 1; + if (entry->state.out) + if (entry->state.out->ifindex == ifindex) + return 1; +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (entry->skb->nf_bridge) { + int physinif, physoutif; + + physinif = nf_bridge_get_physinif(entry->skb); + physoutif = nf_bridge_get_physoutif(entry->skb); + + if (physinif == ifindex || physoutif == ifindex) + return 1; + } +#endif + return 0; +} + +/* drop all packets with either indev or outdev == ifindex from all queue + * instances */ +static void +nfqnl_dev_drop(struct net *net, int ifindex) +{ + int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + rcu_read_lock(); + + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct nfqnl_instance *inst; + struct hlist_head *head = &q->instance_table[i]; + + hlist_for_each_entry_rcu(inst, head, hlist) + nfqnl_flush(inst, dev_cmp, ifindex); + } + + rcu_read_unlock(); +} + +static int +nfqnl_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + nfqnl_dev_drop(dev_net(dev), dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_dev_notifier = { + .notifier_call = nfqnl_rcv_dev_event, +}; + +static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr) +{ + return entry->elem == (struct nf_hook_ops *)ops_ptr; +} + +static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook) +{ + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + int i; + + rcu_read_lock(); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct nfqnl_instance *inst; + struct hlist_head *head = &q->instance_table[i]; + + hlist_for_each_entry_rcu(inst, head, hlist) + nfqnl_flush(inst, nf_hook_cmp, (unsigned long)hook); + } + rcu_read_unlock(); +} + +static int +nfqnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); + + if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { + int i; + + /* destroy all instances for this portid */ + spin_lock(&q->instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *t2; + struct nfqnl_instance *inst; + struct hlist_head *head = &q->instance_table[i]; + + hlist_for_each_entry_safe(inst, t2, head, hlist) { + if (n->portid == inst->peer_portid) + __instance_destroy(inst); + } + } + spin_unlock(&q->instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_rtnl_notifier = { + .notifier_call = nfqnl_rcv_nl_event, +}; + +static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { + [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, + [NFQA_MARK] = { .type = NLA_U32 }, + [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, + [NFQA_CT] = { .type = NLA_UNSPEC }, + [NFQA_EXP] = { .type = NLA_UNSPEC }, +}; + +static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { + [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, + [NFQA_MARK] = { .type = NLA_U32 }, +}; + +static struct nfqnl_instance * +verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid) +{ + struct nfqnl_instance *queue; + + queue = instance_lookup(q, queue_num); + if (!queue) + return ERR_PTR(-ENODEV); + + if (queue->peer_portid != nlportid) + return ERR_PTR(-EPERM); + + return queue; +} + +static struct nfqnl_msg_verdict_hdr* +verdicthdr_get(const struct nlattr * const nfqa[]) +{ + struct nfqnl_msg_verdict_hdr *vhdr; + unsigned int verdict; + + if (!nfqa[NFQA_VERDICT_HDR]) + return NULL; + + vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); + verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; + if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) + return NULL; + return vhdr; +} + +static int nfq_id_after(unsigned int id, unsigned int max) +{ + return (int)(id - max) > 0; +} + +static int +nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_queue_entry *entry, *tmp; + unsigned int verdict, maxid; + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + LIST_HEAD(batch_list); + u16 queue_num = ntohs(nfmsg->res_id); + + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); + if (IS_ERR(queue)) + return PTR_ERR(queue); + + vhdr = verdicthdr_get(nfqa); + if (!vhdr) + return -EINVAL; + + verdict = ntohl(vhdr->verdict); + maxid = ntohl(vhdr->id); + + spin_lock_bh(&queue->lock); + + list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { + if (nfq_id_after(entry->id, maxid)) + break; + __dequeue_entry(queue, entry); + list_add_tail(&entry->list, &batch_list); + } + + spin_unlock_bh(&queue->lock); + + if (list_empty(&batch_list)) + return -ENOENT; + + list_for_each_entry_safe(entry, tmp, &batch_list, list) { + if (nfqa[NFQA_MARK]) + entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + nf_reinject(entry, verdict); + } + return 0; +} + +static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[], + struct nf_queue_entry *entry, + enum ip_conntrack_info *ctinfo) +{ + struct nf_conn *ct; + + ct = nfnl_ct->get_ct(entry->skb, ctinfo); + if (ct == NULL) + return NULL; + + if (nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0) + return NULL; + + if (nfqa[NFQA_EXP]) + nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct, + NETLINK_CB(entry->skb).portid, + nlmsg_report(nlh)); + return ct; +} + +static int +nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + unsigned int verdict; + struct nf_queue_entry *entry; + enum ip_conntrack_info uninitialized_var(ctinfo); + struct nfnl_ct_hook *nfnl_ct; + struct nf_conn *ct = NULL; + + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + queue = instance_lookup(q, queue_num); + if (!queue) + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); + if (IS_ERR(queue)) + return PTR_ERR(queue); + + vhdr = verdicthdr_get(nfqa); + if (!vhdr) + return -EINVAL; + + verdict = ntohl(vhdr->verdict); + + entry = find_dequeue_entry(queue, ntohl(vhdr->id)); + if (entry == NULL) + return -ENOENT; + + /* rcu lock already held from nfnl->call_rcu. */ + nfnl_ct = rcu_dereference(nfnl_ct_hook); + + if (nfqa[NFQA_CT]) { + if (nfnl_ct != NULL) + ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo); + } + + if (nfqa[NFQA_PAYLOAD]) { + u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); + int diff = payload_len - entry->skb->len; + + if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), + payload_len, entry, diff) < 0) + verdict = NF_DROP; + + if (ct && diff) + nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff); + } + + if (nfqa[NFQA_MARK]) + entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + + nf_reinject(entry, verdict); + return 0; +} + +static int +nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + return -ENOTSUPP; +} + +static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { + [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, + [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, +}; + +static const struct nf_queue_handler nfqh = { + .outfn = &nfqnl_enqueue_packet, + .nf_hook_drop = &nfqnl_nf_hook_drop, +}; + +static int +nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + struct nfqnl_instance *queue; + struct nfqnl_msg_config_cmd *cmd = NULL; + struct net *net = sock_net(ctnl); + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + int ret = 0; + + if (nfqa[NFQA_CFG_CMD]) { + cmd = nla_data(nfqa[NFQA_CFG_CMD]); + + /* Obsolete commands without queue context */ + switch (cmd->command) { + case NFQNL_CFG_CMD_PF_BIND: return 0; + case NFQNL_CFG_CMD_PF_UNBIND: return 0; + } + } + + rcu_read_lock(); + queue = instance_lookup(q, queue_num); + if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { + ret = -EPERM; + goto err_out_unlock; + } + + if (cmd != NULL) { + switch (cmd->command) { + case NFQNL_CFG_CMD_BIND: + if (queue) { + ret = -EBUSY; + goto err_out_unlock; + } + queue = instance_create(q, queue_num, + NETLINK_CB(skb).portid); + if (IS_ERR(queue)) { + ret = PTR_ERR(queue); + goto err_out_unlock; + } + break; + case NFQNL_CFG_CMD_UNBIND: + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + instance_destroy(q, queue); + break; + case NFQNL_CFG_CMD_PF_BIND: + case NFQNL_CFG_CMD_PF_UNBIND: + break; + default: + ret = -ENOTSUPP; + break; + } + } + + if (nfqa[NFQA_CFG_PARAMS]) { + struct nfqnl_msg_config_params *params; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + params = nla_data(nfqa[NFQA_CFG_PARAMS]); + nfqnl_set_mode(queue, params->copy_mode, + ntohl(params->copy_range)); + } + + if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { + __be32 *queue_maxlen; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); + spin_lock_bh(&queue->lock); + queue->queue_maxlen = ntohl(*queue_maxlen); + spin_unlock_bh(&queue->lock); + } + + if (nfqa[NFQA_CFG_FLAGS]) { + __u32 flags, mask; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + + if (!nfqa[NFQA_CFG_MASK]) { + /* A mask is needed to specify which flags are being + * changed. + */ + ret = -EINVAL; + goto err_out_unlock; + } + + flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); + mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); + + if (flags >= NFQA_CFG_F_MAX) { + ret = -EOPNOTSUPP; + goto err_out_unlock; + } +#if !IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (flags & mask & NFQA_CFG_F_SECCTX) { + ret = -EOPNOTSUPP; + goto err_out_unlock; + } +#endif + spin_lock_bh(&queue->lock); + queue->flags &= ~mask; + queue->flags |= flags & mask; + spin_unlock_bh(&queue->lock); + } + +err_out_unlock: + rcu_read_unlock(); + return ret; +} + +static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { + [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp, + .attr_count = NFQA_MAX, }, + [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_policy }, + [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, + .attr_count = NFQA_CFG_MAX, + .policy = nfqa_cfg_policy }, + [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_batch_policy }, +}; + +static const struct nfnetlink_subsystem nfqnl_subsys = { + .name = "nf_queue", + .subsys_id = NFNL_SUBSYS_QUEUE, + .cb_count = NFQNL_MSG_MAX, + .cb = nfqnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + struct seq_net_private p; + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + struct net *net; + struct nfnl_queue_net *q; + + if (!st) + return NULL; + + net = seq_file_net(seq); + q = nfnl_queue_pernet(net); + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&q->instance_table[st->bucket])) + return q->instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + struct iter_state *st = seq->private; + struct net *net = seq_file_net(seq); + + h = h->next; + while (!h) { + struct nfnl_queue_net *q; + + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + q = nfnl_queue_pernet(net); + h = q->instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *seq_start(struct seq_file *s, loff_t *pos) + __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) +{ + spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); + return get_idx(s, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) + __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) +{ + spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfqnl_instance *inst = v; + + seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", + inst->queue_num, + inst->peer_portid, inst->queue_total, + inst->copy_mode, inst->copy_range, + inst->queue_dropped, inst->queue_user_dropped, + inst->id_sequence, 1); + return 0; +} + +static const struct seq_operations nfqnl_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqnl_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &nfqnl_seq_ops, + sizeof(struct iter_state)); +} + +static const struct file_operations nfqnl_file_ops = { + .owner = THIS_MODULE, + .open = nfqnl_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif /* PROC_FS */ + +static int __net_init nfnl_queue_net_init(struct net *net) +{ + unsigned int i; + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&q->instance_table[i]); + + spin_lock_init(&q->instances_lock); + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_queue", 0440, + net->nf.proc_netfilter, &nfqnl_file_ops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit nfnl_queue_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); +#endif +} + +static struct pernet_operations nfnl_queue_net_ops = { + .init = nfnl_queue_net_init, + .exit = nfnl_queue_net_exit, + .id = &nfnl_queue_net_id, + .size = sizeof(struct nfnl_queue_net), +}; + +static int __init nfnetlink_queue_init(void) +{ + int status; + + status = register_pernet_subsys(&nfnl_queue_net_ops); + if (status < 0) { + pr_err("nf_queue: failed to register pernet ops\n"); + goto out; + } + + netlink_register_notifier(&nfqnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) { + pr_err("nf_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + register_netdevice_notifier(&nfqnl_dev_notifier); + nf_register_queue_handler(&nfqh); + return status; + +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_queue_net_ops); +out: + return status; +} + +static void __exit nfnetlink_queue_fini(void) +{ + nf_unregister_queue_handler(); + unregister_netdevice_notifier(&nfqnl_dev_notifier); + nfnetlink_subsys_unregister(&nfqnl_subsys); + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_queue_net_ops); + + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} + +MODULE_DESCRIPTION("netfilter packet queue handler"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); + +module_init(nfnetlink_queue_init); +module_exit(nfnetlink_queue_fini); diff --git a/kernel/net/netfilter/nfnetlink_queue_core.c b/kernel/net/netfilter/nfnetlink_queue_core.c deleted file mode 100644 index 11c7682fa..000000000 --- a/kernel/net/netfilter/nfnetlink_queue_core.c +++ /dev/null @@ -1,1362 +0,0 @@ -/* - * This is a module which is used for queueing packets and communicating with - * userspace via nfnetlink. - * - * (C) 2005 by Harald Welte - * (C) 2007 by Patrick McHardy - * - * Based on the old ipv4-only ip_queue.c: - * (C) 2000-2002 James Morris - * (C) 2003-2005 Netfilter Core Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -#include "../bridge/br_private.h" -#endif - -#define NFQNL_QMAX_DEFAULT 1024 - -/* We're using struct nlattr which has 16bit nla_len. Note that nla_len - * includes the header length. Thus, the maximum packet length that we - * support is 65531 bytes. We send truncated packets if the specified length - * is larger than that. Userspace can check for presence of NFQA_CAP_LEN - * attribute to detect truncation. - */ -#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) - -struct nfqnl_instance { - struct hlist_node hlist; /* global list of queues */ - struct rcu_head rcu; - - u32 peer_portid; - unsigned int queue_maxlen; - unsigned int copy_range; - unsigned int queue_dropped; - unsigned int queue_user_dropped; - - - u_int16_t queue_num; /* number of this queue */ - u_int8_t copy_mode; - u_int32_t flags; /* Set using NFQA_CFG_FLAGS */ -/* - * Following fields are dirtied for each queued packet, - * keep them in same cache line if possible. - */ - spinlock_t lock; - unsigned int queue_total; - unsigned int id_sequence; /* 'sequence' of pkt ids */ - struct list_head queue_list; /* packets in queue */ -}; - -typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); - -static int nfnl_queue_net_id __read_mostly; - -#define INSTANCE_BUCKETS 16 -struct nfnl_queue_net { - spinlock_t instances_lock; - struct hlist_head instance_table[INSTANCE_BUCKETS]; -}; - -static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) -{ - return net_generic(net, nfnl_queue_net_id); -} - -static inline u_int8_t instance_hashfn(u_int16_t queue_num) -{ - return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; -} - -static struct nfqnl_instance * -instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) -{ - struct hlist_head *head; - struct nfqnl_instance *inst; - - head = &q->instance_table[instance_hashfn(queue_num)]; - hlist_for_each_entry_rcu(inst, head, hlist) { - if (inst->queue_num == queue_num) - return inst; - } - return NULL; -} - -static struct nfqnl_instance * -instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) -{ - struct nfqnl_instance *inst; - unsigned int h; - int err; - - spin_lock(&q->instances_lock); - if (instance_lookup(q, queue_num)) { - err = -EEXIST; - goto out_unlock; - } - - inst = kzalloc(sizeof(*inst), GFP_ATOMIC); - if (!inst) { - err = -ENOMEM; - goto out_unlock; - } - - inst->queue_num = queue_num; - inst->peer_portid = portid; - inst->queue_maxlen = NFQNL_QMAX_DEFAULT; - inst->copy_range = NFQNL_MAX_COPY_RANGE; - inst->copy_mode = NFQNL_COPY_NONE; - spin_lock_init(&inst->lock); - INIT_LIST_HEAD(&inst->queue_list); - - if (!try_module_get(THIS_MODULE)) { - err = -EAGAIN; - goto out_free; - } - - h = instance_hashfn(queue_num); - hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); - - spin_unlock(&q->instances_lock); - - return inst; - -out_free: - kfree(inst); -out_unlock: - spin_unlock(&q->instances_lock); - return ERR_PTR(err); -} - -static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, - unsigned long data); - -static void -instance_destroy_rcu(struct rcu_head *head) -{ - struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, - rcu); - - nfqnl_flush(inst, NULL, 0); - kfree(inst); - module_put(THIS_MODULE); -} - -static void -__instance_destroy(struct nfqnl_instance *inst) -{ - hlist_del_rcu(&inst->hlist); - call_rcu(&inst->rcu, instance_destroy_rcu); -} - -static void -instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) -{ - spin_lock(&q->instances_lock); - __instance_destroy(inst); - spin_unlock(&q->instances_lock); -} - -static inline void -__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) -{ - list_add_tail(&entry->list, &queue->queue_list); - queue->queue_total++; -} - -static void -__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) -{ - list_del(&entry->list); - queue->queue_total--; -} - -static struct nf_queue_entry * -find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) -{ - struct nf_queue_entry *entry = NULL, *i; - - spin_lock_bh(&queue->lock); - - list_for_each_entry(i, &queue->queue_list, list) { - if (i->id == id) { - entry = i; - break; - } - } - - if (entry) - __dequeue_entry(queue, entry); - - spin_unlock_bh(&queue->lock); - - return entry; -} - -static void -nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) -{ - struct nf_queue_entry *entry, *next; - - spin_lock_bh(&queue->lock); - list_for_each_entry_safe(entry, next, &queue->queue_list, list) { - if (!cmpfn || cmpfn(entry, data)) { - list_del(&entry->list); - queue->queue_total--; - nf_reinject(entry, NF_DROP); - } - } - spin_unlock_bh(&queue->lock); -} - -static int -nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, - bool csum_verify) -{ - __u32 flags = 0; - - if (packet->ip_summed == CHECKSUM_PARTIAL) - flags = NFQA_SKB_CSUMNOTREADY; - else if (csum_verify) - flags = NFQA_SKB_CSUM_NOTVERIFIED; - - if (skb_is_gso(packet)) - flags |= NFQA_SKB_GSO; - - return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; -} - -static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) -{ - const struct cred *cred; - - if (!sk_fullsock(sk)) - return 0; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - cred = sk->sk_socket->file->f_cred; - if (nla_put_be32(skb, NFQA_UID, - htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) - goto nla_put_failure; - if (nla_put_be32(skb, NFQA_GID, - htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) - goto nla_put_failure; - } - read_unlock_bh(&sk->sk_callback_lock); - return 0; - -nla_put_failure: - read_unlock_bh(&sk->sk_callback_lock); - return -1; -} - -static struct sk_buff * -nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, - struct nf_queue_entry *entry, - __be32 **packet_id_ptr) -{ - size_t size; - size_t data_len = 0, cap_len = 0; - unsigned int hlen = 0; - struct sk_buff *skb; - struct nlattr *nla; - struct nfqnl_msg_packet_hdr *pmsg; - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - struct sk_buff *entskb = entry->skb; - struct net_device *indev; - struct net_device *outdev; - struct nf_conn *ct = NULL; - enum ip_conntrack_info uninitialized_var(ctinfo); - bool csum_verify; - - size = nlmsg_total_size(sizeof(struct nfgenmsg)) - + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) - + nla_total_size(sizeof(u_int32_t)) /* ifindex */ - + nla_total_size(sizeof(u_int32_t)) /* ifindex */ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - + nla_total_size(sizeof(u_int32_t)) /* ifindex */ - + nla_total_size(sizeof(u_int32_t)) /* ifindex */ -#endif - + nla_total_size(sizeof(u_int32_t)) /* mark */ - + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) - + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ - + nla_total_size(sizeof(u_int32_t)); /* cap_len */ - - if (entskb->tstamp.tv64) - size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); - - if (entry->state.hook <= NF_INET_FORWARD || - (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) - csum_verify = !skb_csum_unnecessary(entskb); - else - csum_verify = false; - - outdev = entry->state.out; - - switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { - case NFQNL_COPY_META: - case NFQNL_COPY_NONE: - break; - - case NFQNL_COPY_PACKET: - if (!(queue->flags & NFQA_CFG_F_GSO) && - entskb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(entskb)) - return NULL; - - data_len = ACCESS_ONCE(queue->copy_range); - if (data_len > entskb->len) - data_len = entskb->len; - - hlen = skb_zerocopy_headlen(entskb); - hlen = min_t(unsigned int, hlen, data_len); - size += sizeof(struct nlattr) + hlen; - cap_len = entskb->len; - break; - } - - if (queue->flags & NFQA_CFG_F_CONNTRACK) - ct = nfqnl_ct_get(entskb, &size, &ctinfo); - - if (queue->flags & NFQA_CFG_F_UID_GID) { - size += (nla_total_size(sizeof(u_int32_t)) /* uid */ - + nla_total_size(sizeof(u_int32_t))); /* gid */ - } - - skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, - GFP_ATOMIC); - if (!skb) { - skb_tx_error(entskb); - return NULL; - } - - nlh = nlmsg_put(skb, 0, 0, - NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, - sizeof(struct nfgenmsg), 0); - if (!nlh) { - skb_tx_error(entskb); - kfree_skb(skb); - return NULL; - } - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = entry->state.pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(queue->queue_num); - - nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); - pmsg = nla_data(nla); - pmsg->hw_protocol = entskb->protocol; - pmsg->hook = entry->state.hook; - *packet_id_ptr = &pmsg->packet_id; - - indev = entry->state.in; - if (indev) { -#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) - goto nla_put_failure; -#else - if (entry->state.pf == PF_BRIDGE) { - /* Case 1: indev is physical input device, we need to - * look for bridge group (when called from - * netfilter_bridge) */ - if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, - htonl(indev->ifindex)) || - /* this is the bridge group "brX" */ - /* rcu_read_lock()ed by __nf_queue */ - nla_put_be32(skb, NFQA_IFINDEX_INDEV, - htonl(br_port_get_rcu(indev)->br->dev->ifindex))) - goto nla_put_failure; - } else { - int physinif; - - /* Case 2: indev is bridge group, we need to look for - * physical device (when called from ipv4) */ - if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, - htonl(indev->ifindex))) - goto nla_put_failure; - - physinif = nf_bridge_get_physinif(entskb); - if (physinif && - nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, - htonl(physinif))) - goto nla_put_failure; - } -#endif - } - - if (outdev) { -#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) - goto nla_put_failure; -#else - if (entry->state.pf == PF_BRIDGE) { - /* Case 1: outdev is physical output device, we need to - * look for bridge group (when called from - * netfilter_bridge) */ - if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, - htonl(outdev->ifindex)) || - /* this is the bridge group "brX" */ - /* rcu_read_lock()ed by __nf_queue */ - nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, - htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) - goto nla_put_failure; - } else { - int physoutif; - - /* Case 2: outdev is bridge group, we need to look for - * physical output device (when called from ipv4) */ - if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, - htonl(outdev->ifindex))) - goto nla_put_failure; - - physoutif = nf_bridge_get_physoutif(entskb); - if (physoutif && - nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, - htonl(physoutif))) - goto nla_put_failure; - } -#endif - } - - if (entskb->mark && - nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) - goto nla_put_failure; - - if (indev && entskb->dev && - entskb->mac_header != entskb->network_header) { - struct nfqnl_msg_packet_hw phw; - int len; - - memset(&phw, 0, sizeof(phw)); - len = dev_parse_header(entskb, phw.hw_addr); - if (len) { - phw.hw_addrlen = htons(len); - if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) - goto nla_put_failure; - } - } - - if (entskb->tstamp.tv64) { - struct nfqnl_msg_packet_timestamp ts; - struct timeval tv = ktime_to_timeval(entskb->tstamp); - ts.sec = cpu_to_be64(tv.tv_sec); - ts.usec = cpu_to_be64(tv.tv_usec); - - if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) - goto nla_put_failure; - } - - if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && - nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) - goto nla_put_failure; - - if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) - goto nla_put_failure; - - if (cap_len > data_len && - nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) - goto nla_put_failure; - - if (nfqnl_put_packet_info(skb, entskb, csum_verify)) - goto nla_put_failure; - - if (data_len) { - struct nlattr *nla; - - if (skb_tailroom(skb) < sizeof(*nla) + hlen) - goto nla_put_failure; - - nla = (struct nlattr *)skb_put(skb, sizeof(*nla)); - nla->nla_type = NFQA_PAYLOAD; - nla->nla_len = nla_attr_size(data_len); - - if (skb_zerocopy(skb, entskb, data_len, hlen)) - goto nla_put_failure; - } - - nlh->nlmsg_len = skb->len; - return skb; - -nla_put_failure: - skb_tx_error(entskb); - kfree_skb(skb); - net_err_ratelimited("nf_queue: error creating packet message\n"); - return NULL; -} - -static int -__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, - struct nf_queue_entry *entry) -{ - struct sk_buff *nskb; - int err = -ENOBUFS; - __be32 *packet_id_ptr; - int failopen = 0; - - nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); - if (nskb == NULL) { - err = -ENOMEM; - goto err_out; - } - spin_lock_bh(&queue->lock); - - if (queue->queue_total >= queue->queue_maxlen) { - if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { - failopen = 1; - err = 0; - } else { - queue->queue_dropped++; - net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", - queue->queue_total); - } - goto err_out_free_nskb; - } - entry->id = ++queue->id_sequence; - *packet_id_ptr = htonl(entry->id); - - /* nfnetlink_unicast will either free the nskb or add it to a socket */ - err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT); - if (err < 0) { - queue->queue_user_dropped++; - goto err_out_unlock; - } - - __enqueue_entry(queue, entry); - - spin_unlock_bh(&queue->lock); - return 0; - -err_out_free_nskb: - kfree_skb(nskb); -err_out_unlock: - spin_unlock_bh(&queue->lock); - if (failopen) - nf_reinject(entry, NF_ACCEPT); -err_out: - return err; -} - -static struct nf_queue_entry * -nf_queue_entry_dup(struct nf_queue_entry *e) -{ - struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); - if (entry) { - if (nf_queue_entry_get_refs(entry)) - return entry; - kfree(entry); - } - return NULL; -} - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -/* When called from bridge netfilter, skb->data must point to MAC header - * before calling skb_gso_segment(). Else, original MAC header is lost - * and segmented skbs will be sent to wrong destination. - */ -static void nf_bridge_adjust_skb_data(struct sk_buff *skb) -{ - if (skb->nf_bridge) - __skb_push(skb, skb->network_header - skb->mac_header); -} - -static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) -{ - if (skb->nf_bridge) - __skb_pull(skb, skb->network_header - skb->mac_header); -} -#else -#define nf_bridge_adjust_skb_data(s) do {} while (0) -#define nf_bridge_adjust_segmented_data(s) do {} while (0) -#endif - -static void free_entry(struct nf_queue_entry *entry) -{ - nf_queue_entry_release_refs(entry); - kfree(entry); -} - -static int -__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, - struct sk_buff *skb, struct nf_queue_entry *entry) -{ - int ret = -ENOMEM; - struct nf_queue_entry *entry_seg; - - nf_bridge_adjust_segmented_data(skb); - - if (skb->next == NULL) { /* last packet, no need to copy entry */ - struct sk_buff *gso_skb = entry->skb; - entry->skb = skb; - ret = __nfqnl_enqueue_packet(net, queue, entry); - if (ret) - entry->skb = gso_skb; - return ret; - } - - skb->next = NULL; - - entry_seg = nf_queue_entry_dup(entry); - if (entry_seg) { - entry_seg->skb = skb; - ret = __nfqnl_enqueue_packet(net, queue, entry_seg); - if (ret) - free_entry(entry_seg); - } - return ret; -} - -static int -nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) -{ - unsigned int queued; - struct nfqnl_instance *queue; - struct sk_buff *skb, *segs; - int err = -ENOBUFS; - struct net *net = dev_net(entry->state.in ? - entry->state.in : entry->state.out); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - - /* rcu_read_lock()ed by nf_hook_slow() */ - queue = instance_lookup(q, queuenum); - if (!queue) - return -ESRCH; - - if (queue->copy_mode == NFQNL_COPY_NONE) - return -EINVAL; - - skb = entry->skb; - - switch (entry->state.pf) { - case NFPROTO_IPV4: - skb->protocol = htons(ETH_P_IP); - break; - case NFPROTO_IPV6: - skb->protocol = htons(ETH_P_IPV6); - break; - } - - if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) - return __nfqnl_enqueue_packet(net, queue, entry); - - nf_bridge_adjust_skb_data(skb); - segs = skb_gso_segment(skb, 0); - /* Does not use PTR_ERR to limit the number of error codes that can be - * returned by nf_queue. For instance, callers rely on -ECANCELED to - * mean 'ignore this hook'. - */ - if (IS_ERR_OR_NULL(segs)) - goto out_err; - queued = 0; - err = 0; - do { - struct sk_buff *nskb = segs->next; - if (err == 0) - err = __nfqnl_enqueue_packet_gso(net, queue, - segs, entry); - if (err == 0) - queued++; - else - kfree_skb(segs); - segs = nskb; - } while (segs); - - if (queued) { - if (err) /* some segments are already queued */ - free_entry(entry); - kfree_skb(skb); - return 0; - } - out_err: - nf_bridge_adjust_segmented_data(skb); - return err; -} - -static int -nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) -{ - struct sk_buff *nskb; - - if (diff < 0) { - if (pskb_trim(e->skb, data_len)) - return -ENOMEM; - } else if (diff > 0) { - if (data_len > 0xFFFF) - return -EINVAL; - if (diff > skb_tailroom(e->skb)) { - nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), - diff, GFP_ATOMIC); - if (!nskb) { - printk(KERN_WARNING "nf_queue: OOM " - "in mangle, dropping packet\n"); - return -ENOMEM; - } - kfree_skb(e->skb); - e->skb = nskb; - } - skb_put(e->skb, diff); - } - if (!skb_make_writable(e->skb, data_len)) - return -ENOMEM; - skb_copy_to_linear_data(e->skb, data, data_len); - e->skb->ip_summed = CHECKSUM_NONE; - return 0; -} - -static int -nfqnl_set_mode(struct nfqnl_instance *queue, - unsigned char mode, unsigned int range) -{ - int status = 0; - - spin_lock_bh(&queue->lock); - switch (mode) { - case NFQNL_COPY_NONE: - case NFQNL_COPY_META: - queue->copy_mode = mode; - queue->copy_range = 0; - break; - - case NFQNL_COPY_PACKET: - queue->copy_mode = mode; - if (range == 0 || range > NFQNL_MAX_COPY_RANGE) - queue->copy_range = NFQNL_MAX_COPY_RANGE; - else - queue->copy_range = range; - break; - - default: - status = -EINVAL; - - } - spin_unlock_bh(&queue->lock); - - return status; -} - -static int -dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) -{ - if (entry->state.in) - if (entry->state.in->ifindex == ifindex) - return 1; - if (entry->state.out) - if (entry->state.out->ifindex == ifindex) - return 1; -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->skb->nf_bridge) { - int physinif, physoutif; - - physinif = nf_bridge_get_physinif(entry->skb); - physoutif = nf_bridge_get_physoutif(entry->skb); - - if (physinif == ifindex || physoutif == ifindex) - return 1; - } -#endif - return 0; -} - -/* drop all packets with either indev or outdev == ifindex from all queue - * instances */ -static void -nfqnl_dev_drop(struct net *net, int ifindex) -{ - int i; - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - - rcu_read_lock(); - - for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct nfqnl_instance *inst; - struct hlist_head *head = &q->instance_table[i]; - - hlist_for_each_entry_rcu(inst, head, hlist) - nfqnl_flush(inst, dev_cmp, ifindex); - } - - rcu_read_unlock(); -} - -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - -static int -nfqnl_rcv_dev_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - /* Drop any packets associated with the downed device */ - if (event == NETDEV_DOWN) - nfqnl_dev_drop(dev_net(dev), dev->ifindex); - return NOTIFY_DONE; -} - -static struct notifier_block nfqnl_dev_notifier = { - .notifier_call = nfqnl_rcv_dev_event, -}; - -static int -nfqnl_rcv_nl_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct netlink_notify *n = ptr; - struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); - - if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { - int i; - - /* destroy all instances for this portid */ - spin_lock(&q->instances_lock); - for (i = 0; i < INSTANCE_BUCKETS; i++) { - struct hlist_node *t2; - struct nfqnl_instance *inst; - struct hlist_head *head = &q->instance_table[i]; - - hlist_for_each_entry_safe(inst, t2, head, hlist) { - if (n->portid == inst->peer_portid) - __instance_destroy(inst); - } - } - spin_unlock(&q->instances_lock); - } - return NOTIFY_DONE; -} - -static struct notifier_block nfqnl_rtnl_notifier = { - .notifier_call = nfqnl_rcv_nl_event, -}; - -static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { - [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, - [NFQA_MARK] = { .type = NLA_U32 }, - [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, - [NFQA_CT] = { .type = NLA_UNSPEC }, - [NFQA_EXP] = { .type = NLA_UNSPEC }, -}; - -static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { - [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, - [NFQA_MARK] = { .type = NLA_U32 }, -}; - -static struct nfqnl_instance * -verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid) -{ - struct nfqnl_instance *queue; - - queue = instance_lookup(q, queue_num); - if (!queue) - return ERR_PTR(-ENODEV); - - if (queue->peer_portid != nlportid) - return ERR_PTR(-EPERM); - - return queue; -} - -static struct nfqnl_msg_verdict_hdr* -verdicthdr_get(const struct nlattr * const nfqa[]) -{ - struct nfqnl_msg_verdict_hdr *vhdr; - unsigned int verdict; - - if (!nfqa[NFQA_VERDICT_HDR]) - return NULL; - - vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); - verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; - if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) - return NULL; - return vhdr; -} - -static int nfq_id_after(unsigned int id, unsigned int max) -{ - return (int)(id - max) > 0; -} - -static int -nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) -{ - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct nf_queue_entry *entry, *tmp; - unsigned int verdict, maxid; - struct nfqnl_msg_verdict_hdr *vhdr; - struct nfqnl_instance *queue; - LIST_HEAD(batch_list); - u16 queue_num = ntohs(nfmsg->res_id); - - struct net *net = sock_net(ctnl); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - - queue = verdict_instance_lookup(q, queue_num, - NETLINK_CB(skb).portid); - if (IS_ERR(queue)) - return PTR_ERR(queue); - - vhdr = verdicthdr_get(nfqa); - if (!vhdr) - return -EINVAL; - - verdict = ntohl(vhdr->verdict); - maxid = ntohl(vhdr->id); - - spin_lock_bh(&queue->lock); - - list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { - if (nfq_id_after(entry->id, maxid)) - break; - __dequeue_entry(queue, entry); - list_add_tail(&entry->list, &batch_list); - } - - spin_unlock_bh(&queue->lock); - - if (list_empty(&batch_list)) - return -ENOENT; - - list_for_each_entry_safe(entry, tmp, &batch_list, list) { - if (nfqa[NFQA_MARK]) - entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); - nf_reinject(entry, verdict); - } - return 0; -} - -static int -nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) -{ - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); - - struct nfqnl_msg_verdict_hdr *vhdr; - struct nfqnl_instance *queue; - unsigned int verdict; - struct nf_queue_entry *entry; - enum ip_conntrack_info uninitialized_var(ctinfo); - struct nf_conn *ct = NULL; - - struct net *net = sock_net(ctnl); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - - queue = instance_lookup(q, queue_num); - if (!queue) - queue = verdict_instance_lookup(q, queue_num, - NETLINK_CB(skb).portid); - if (IS_ERR(queue)) - return PTR_ERR(queue); - - vhdr = verdicthdr_get(nfqa); - if (!vhdr) - return -EINVAL; - - verdict = ntohl(vhdr->verdict); - - entry = find_dequeue_entry(queue, ntohl(vhdr->id)); - if (entry == NULL) - return -ENOENT; - - if (nfqa[NFQA_CT]) { - ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo); - if (ct && nfqa[NFQA_EXP]) { - nfqnl_attach_expect(ct, nfqa[NFQA_EXP], - NETLINK_CB(skb).portid, - nlmsg_report(nlh)); - } - } - - if (nfqa[NFQA_PAYLOAD]) { - u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); - int diff = payload_len - entry->skb->len; - - if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), - payload_len, entry, diff) < 0) - verdict = NF_DROP; - - if (ct) - nfqnl_ct_seq_adjust(entry->skb, ct, ctinfo, diff); - } - - if (nfqa[NFQA_MARK]) - entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); - - nf_reinject(entry, verdict); - return 0; -} - -static int -nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) -{ - return -ENOTSUPP; -} - -static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { - [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, - [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, -}; - -static const struct nf_queue_handler nfqh = { - .outfn = &nfqnl_enqueue_packet, -}; - -static int -nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) -{ - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); - struct nfqnl_instance *queue; - struct nfqnl_msg_config_cmd *cmd = NULL; - struct net *net = sock_net(ctnl); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - int ret = 0; - - if (nfqa[NFQA_CFG_CMD]) { - cmd = nla_data(nfqa[NFQA_CFG_CMD]); - - /* Obsolete commands without queue context */ - switch (cmd->command) { - case NFQNL_CFG_CMD_PF_BIND: return 0; - case NFQNL_CFG_CMD_PF_UNBIND: return 0; - } - } - - rcu_read_lock(); - queue = instance_lookup(q, queue_num); - if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { - ret = -EPERM; - goto err_out_unlock; - } - - if (cmd != NULL) { - switch (cmd->command) { - case NFQNL_CFG_CMD_BIND: - if (queue) { - ret = -EBUSY; - goto err_out_unlock; - } - queue = instance_create(q, queue_num, - NETLINK_CB(skb).portid); - if (IS_ERR(queue)) { - ret = PTR_ERR(queue); - goto err_out_unlock; - } - break; - case NFQNL_CFG_CMD_UNBIND: - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - instance_destroy(q, queue); - break; - case NFQNL_CFG_CMD_PF_BIND: - case NFQNL_CFG_CMD_PF_UNBIND: - break; - default: - ret = -ENOTSUPP; - break; - } - } - - if (nfqa[NFQA_CFG_PARAMS]) { - struct nfqnl_msg_config_params *params; - - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - params = nla_data(nfqa[NFQA_CFG_PARAMS]); - nfqnl_set_mode(queue, params->copy_mode, - ntohl(params->copy_range)); - } - - if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { - __be32 *queue_maxlen; - - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); - spin_lock_bh(&queue->lock); - queue->queue_maxlen = ntohl(*queue_maxlen); - spin_unlock_bh(&queue->lock); - } - - if (nfqa[NFQA_CFG_FLAGS]) { - __u32 flags, mask; - - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - - if (!nfqa[NFQA_CFG_MASK]) { - /* A mask is needed to specify which flags are being - * changed. - */ - ret = -EINVAL; - goto err_out_unlock; - } - - flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); - mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); - - if (flags >= NFQA_CFG_F_MAX) { - ret = -EOPNOTSUPP; - goto err_out_unlock; - } - - spin_lock_bh(&queue->lock); - queue->flags &= ~mask; - queue->flags |= flags & mask; - spin_unlock_bh(&queue->lock); - } - -err_out_unlock: - rcu_read_unlock(); - return ret; -} - -static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { - [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp, - .attr_count = NFQA_MAX, }, - [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict, - .attr_count = NFQA_MAX, - .policy = nfqa_verdict_policy }, - [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, - .attr_count = NFQA_CFG_MAX, - .policy = nfqa_cfg_policy }, - [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch, - .attr_count = NFQA_MAX, - .policy = nfqa_verdict_batch_policy }, -}; - -static const struct nfnetlink_subsystem nfqnl_subsys = { - .name = "nf_queue", - .subsys_id = NFNL_SUBSYS_QUEUE, - .cb_count = NFQNL_MSG_MAX, - .cb = nfqnl_cb, -}; - -#ifdef CONFIG_PROC_FS -struct iter_state { - struct seq_net_private p; - unsigned int bucket; -}; - -static struct hlist_node *get_first(struct seq_file *seq) -{ - struct iter_state *st = seq->private; - struct net *net; - struct nfnl_queue_net *q; - - if (!st) - return NULL; - - net = seq_file_net(seq); - q = nfnl_queue_pernet(net); - for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { - if (!hlist_empty(&q->instance_table[st->bucket])) - return q->instance_table[st->bucket].first; - } - return NULL; -} - -static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) -{ - struct iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - - h = h->next; - while (!h) { - struct nfnl_queue_net *q; - - if (++st->bucket >= INSTANCE_BUCKETS) - return NULL; - - q = nfnl_queue_pernet(net); - h = q->instance_table[st->bucket].first; - } - return h; -} - -static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_node *head; - head = get_first(seq); - - if (head) - while (pos && (head = get_next(seq, head))) - pos--; - return pos ? NULL : head; -} - -static void *seq_start(struct seq_file *s, loff_t *pos) - __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) -{ - spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); - return get_idx(s, *pos); -} - -static void *seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - return get_next(s, v); -} - -static void seq_stop(struct seq_file *s, void *v) - __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) -{ - spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); -} - -static int seq_show(struct seq_file *s, void *v) -{ - const struct nfqnl_instance *inst = v; - - seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", - inst->queue_num, - inst->peer_portid, inst->queue_total, - inst->copy_mode, inst->copy_range, - inst->queue_dropped, inst->queue_user_dropped, - inst->id_sequence, 1); - return seq_has_overflowed(s); -} - -static const struct seq_operations nfqnl_seq_ops = { - .start = seq_start, - .next = seq_next, - .stop = seq_stop, - .show = seq_show, -}; - -static int nfqnl_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &nfqnl_seq_ops, - sizeof(struct iter_state)); -} - -static const struct file_operations nfqnl_file_ops = { - .owner = THIS_MODULE, - .open = nfqnl_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -#endif /* PROC_FS */ - -static int __net_init nfnl_queue_net_init(struct net *net) -{ - unsigned int i; - struct nfnl_queue_net *q = nfnl_queue_pernet(net); - - for (i = 0; i < INSTANCE_BUCKETS; i++) - INIT_HLIST_HEAD(&q->instance_table[i]); - - spin_lock_init(&q->instances_lock); - -#ifdef CONFIG_PROC_FS - if (!proc_create("nfnetlink_queue", 0440, - net->nf.proc_netfilter, &nfqnl_file_ops)) - return -ENOMEM; -#endif - return 0; -} - -static void __net_exit nfnl_queue_net_exit(struct net *net) -{ -#ifdef CONFIG_PROC_FS - remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); -#endif -} - -static struct pernet_operations nfnl_queue_net_ops = { - .init = nfnl_queue_net_init, - .exit = nfnl_queue_net_exit, - .id = &nfnl_queue_net_id, - .size = sizeof(struct nfnl_queue_net), -}; - -static int __init nfnetlink_queue_init(void) -{ - int status; - - status = register_pernet_subsys(&nfnl_queue_net_ops); - if (status < 0) { - pr_err("nf_queue: failed to register pernet ops\n"); - goto out; - } - - netlink_register_notifier(&nfqnl_rtnl_notifier); - status = nfnetlink_subsys_register(&nfqnl_subsys); - if (status < 0) { - pr_err("nf_queue: failed to create netlink socket\n"); - goto cleanup_netlink_notifier; - } - - register_netdevice_notifier(&nfqnl_dev_notifier); - nf_register_queue_handler(&nfqh); - return status; - -cleanup_netlink_notifier: - netlink_unregister_notifier(&nfqnl_rtnl_notifier); -out: - return status; -} - -static void __exit nfnetlink_queue_fini(void) -{ - nf_unregister_queue_handler(); - unregister_netdevice_notifier(&nfqnl_dev_notifier); - nfnetlink_subsys_unregister(&nfqnl_subsys); - netlink_unregister_notifier(&nfqnl_rtnl_notifier); - unregister_pernet_subsys(&nfnl_queue_net_ops); - - rcu_barrier(); /* Wait for completion of call_rcu()'s */ -} - -MODULE_DESCRIPTION("netfilter packet queue handler"); -MODULE_AUTHOR("Harald Welte "); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); - -module_init(nfnetlink_queue_init); -module_exit(nfnetlink_queue_fini); diff --git a/kernel/net/netfilter/nfnetlink_queue_ct.c b/kernel/net/netfilter/nfnetlink_queue_ct.c deleted file mode 100644 index 96cac50e0..000000000 --- a/kernel/net/netfilter/nfnetlink_queue_ct.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * (C) 2012 by Pablo Neira Ayuso - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include -#include -#include -#include -#include -#include - -struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size, - enum ip_conntrack_info *ctinfo) -{ - struct nfq_ct_hook *nfq_ct; - struct nf_conn *ct; - - /* rcu_read_lock()ed by __nf_queue already. */ - nfq_ct = rcu_dereference(nfq_ct_hook); - if (nfq_ct == NULL) - return NULL; - - ct = nf_ct_get(entskb, ctinfo); - if (ct) { - if (!nf_ct_is_untracked(ct)) - *size += nfq_ct->build_size(ct); - else - ct = NULL; - } - return ct; -} - -struct nf_conn * -nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr, - enum ip_conntrack_info *ctinfo) -{ - struct nfq_ct_hook *nfq_ct; - struct nf_conn *ct; - - /* rcu_read_lock()ed by __nf_queue already. */ - nfq_ct = rcu_dereference(nfq_ct_hook); - if (nfq_ct == NULL) - return NULL; - - ct = nf_ct_get(skb, ctinfo); - if (ct && !nf_ct_is_untracked(ct)) - nfq_ct->parse(attr, ct); - - return ct; -} - -int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - struct nfq_ct_hook *nfq_ct; - struct nlattr *nest_parms; - u_int32_t tmp; - - nfq_ct = rcu_dereference(nfq_ct_hook); - if (nfq_ct == NULL) - return 0; - - nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED); - if (!nest_parms) - goto nla_put_failure; - - if (nfq_ct->build(skb, ct) < 0) - goto nla_put_failure; - - nla_nest_end(skb, nest_parms); - - tmp = ctinfo; - if (nla_put_be32(skb, NFQA_CT_INFO, htonl(tmp))) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -1; -} - -void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, int diff) -{ - struct nfq_ct_hook *nfq_ct; - - nfq_ct = rcu_dereference(nfq_ct_hook); - if (nfq_ct == NULL) - return; - - if ((ct->status & IPS_NAT_MASK) && diff) - nfq_ct->seq_adjust(skb, ct, ctinfo, diff); -} - -int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, - u32 portid, u32 report) -{ - struct nfq_ct_hook *nfq_ct; - - if (nf_ct_is_untracked(ct)) - return 0; - - nfq_ct = rcu_dereference(nfq_ct_hook); - if (nfq_ct == NULL) - return -EOPNOTSUPP; - - return nfq_ct->attach_expect(attr, ct, portid, report); -} diff --git a/kernel/net/netfilter/nft_compat.c b/kernel/net/netfilter/nft_compat.c index 7f29cfc76..9c8fab001 100644 --- a/kernel/net/netfilter/nft_compat.c +++ b/kernel/net/netfilter/nft_compat.c @@ -161,6 +161,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, par->hook_mask = 0; } par->family = ctx->afi->family; + par->nft_compat = true; } static void target_compat_from_user(struct xt_target *t, void *in, void *out) @@ -377,6 +378,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, par->hook_mask = 0; } par->family = ctx->afi->family; + par->nft_compat = true; } static void match_compat_from_user(struct xt_match *m, void *in, void *out) @@ -617,6 +619,13 @@ struct nft_xt { static struct nft_expr_type nft_match_type; +static bool nft_match_cmp(const struct xt_match *match, + const char *name, u32 rev, u32 family) +{ + return strcmp(match->name, name) == 0 && match->revision == rev && + (match->family == NFPROTO_UNSPEC || match->family == family); +} + static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -624,7 +633,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_match; struct xt_match *match; char *mt_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_MATCH_NAME] == NULL || tb[NFTA_MATCH_REV] == NULL || @@ -639,8 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_match, &nft_match_list, head) { struct xt_match *match = nft_match->ops.data; - if (strcmp(match->name, mt_name) == 0 && - match->revision == rev && match->family == family) { + if (nft_match_cmp(match, mt_name, rev, family)) { if (!try_module_get(match->me)) return ERR_PTR(-ENOENT); @@ -691,6 +699,13 @@ static LIST_HEAD(nft_target_list); static struct nft_expr_type nft_target_type; +static bool nft_target_cmp(const struct xt_target *tg, + const char *name, u32 rev, u32 family) +{ + return strcmp(tg->name, name) == 0 && tg->revision == rev && + (tg->family == NFPROTO_UNSPEC || tg->family == family); +} + static const struct nft_expr_ops * nft_target_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -698,7 +713,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_target; struct xt_target *target; char *tg_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_TARGET_NAME] == NULL || tb[NFTA_TARGET_REV] == NULL || @@ -713,8 +728,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_target, &nft_target_list, head) { struct xt_target *target = nft_target->ops.data; - if (strcmp(target->name, tg_name) == 0 && - target->revision == rev && target->family == family) { + if (nft_target_cmp(target, tg_name, rev, family)) { if (!try_module_get(target->me)) return ERR_PTR(-ENOENT); diff --git a/kernel/net/netfilter/nft_counter.c b/kernel/net/netfilter/nft_counter.c index 175912392..c7808fc19 100644 --- a/kernel/net/netfilter/nft_counter.c +++ b/kernel/net/netfilter/nft_counter.c @@ -18,39 +18,66 @@ #include struct nft_counter { - seqlock_t lock; u64 bytes; u64 packets; }; +struct nft_counter_percpu { + struct nft_counter counter; + struct u64_stats_sync syncp; +}; + +struct nft_counter_percpu_priv { + struct nft_counter_percpu __percpu *counter; +}; + static void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_counter *priv = nft_expr_priv(expr); + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter_percpu *this_cpu; + + local_bh_disable(); + this_cpu = this_cpu_ptr(priv->counter); + u64_stats_update_begin(&this_cpu->syncp); + this_cpu->counter.bytes += pkt->skb->len; + this_cpu->counter.packets++; + u64_stats_update_end(&this_cpu->syncp); + local_bh_enable(); +} - write_seqlock_bh(&priv->lock); - priv->bytes += pkt->skb->len; - priv->packets++; - write_sequnlock_bh(&priv->lock); +static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, + struct nft_counter *total) +{ + const struct nft_counter_percpu *cpu_stats; + u64 bytes, packets; + unsigned int seq; + int cpu; + + memset(total, 0, sizeof(*total)); + for_each_possible_cpu(cpu) { + cpu_stats = per_cpu_ptr(counter, cpu); + do { + seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + bytes = cpu_stats->counter.bytes; + packets = cpu_stats->counter.packets; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + + total->packets += packets; + total->bytes += bytes; + } } static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) { - struct nft_counter *priv = nft_expr_priv(expr); - unsigned int seq; - u64 bytes; - u64 packets; + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter total; - do { - seq = read_seqbegin(&priv->lock); - bytes = priv->bytes; - packets = priv->packets; - } while (read_seqretry(&priv->lock, seq)); + nft_counter_fetch(priv->counter, &total); - if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes))) - goto nla_put_failure; - if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets))) + if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) || + nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets))) goto nla_put_failure; return 0; @@ -67,24 +94,71 @@ static int nft_counter_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - struct nft_counter *priv = nft_expr_priv(expr); + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter_percpu __percpu *cpu_stats; + struct nft_counter_percpu *this_cpu; + + cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); + if (cpu_stats == NULL) + return ENOMEM; + + preempt_disable(); + this_cpu = this_cpu_ptr(cpu_stats); + if (tb[NFTA_COUNTER_PACKETS]) { + this_cpu->counter.packets = + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + } + if (tb[NFTA_COUNTER_BYTES]) { + this_cpu->counter.bytes = + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + } + preempt_enable(); + priv->counter = cpu_stats; + return 0; +} - if (tb[NFTA_COUNTER_PACKETS]) - priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); - if (tb[NFTA_COUNTER_BYTES]) - priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); +static void nft_counter_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - seqlock_init(&priv->lock); + free_percpu(priv->counter); +} + +static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(src); + struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); + struct nft_counter_percpu __percpu *cpu_stats; + struct nft_counter_percpu *this_cpu; + struct nft_counter total; + + nft_counter_fetch(priv->counter, &total); + + cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu, + GFP_ATOMIC); + if (cpu_stats == NULL) + return ENOMEM; + + preempt_disable(); + this_cpu = this_cpu_ptr(cpu_stats); + this_cpu->counter.packets = total.packets; + this_cpu->counter.bytes = total.bytes; + preempt_enable(); + + priv_clone->counter = cpu_stats; return 0; } static struct nft_expr_type nft_counter_type; static const struct nft_expr_ops nft_counter_ops = { .type = &nft_counter_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_counter)), + .size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)), .eval = nft_counter_eval, .init = nft_counter_init, + .destroy = nft_counter_destroy, .dump = nft_counter_dump, + .clone = nft_counter_clone, }; static struct nft_expr_type nft_counter_type __read_mostly = { diff --git a/kernel/net/netfilter/nft_ct.c b/kernel/net/netfilter/nft_ct.c index 8cbca3432..939921532 100644 --- a/kernel/net/netfilter/nft_ct.c +++ b/kernel/net/netfilter/nft_ct.c @@ -366,6 +366,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; switch (priv->key) { + case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: case NFT_CT_SRC: case NFT_CT_DST: diff --git a/kernel/net/netfilter/nft_dynset.c b/kernel/net/netfilter/nft_dynset.c index 513a8ef60..9dec3bd1b 100644 --- a/kernel/net/netfilter/nft_dynset.c +++ b/kernel/net/netfilter/nft_dynset.c @@ -50,8 +50,9 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, } ext = nft_set_elem_ext(set, elem); - if (priv->expr != NULL) - nft_expr_clone(nft_set_ext_expr(ext), priv->expr); + if (priv->expr != NULL && + nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0) + return NULL; return elem; } diff --git a/kernel/net/netfilter/nft_limit.c b/kernel/net/netfilter/nft_limit.c index 435c1ccd6..5d67938f8 100644 --- a/kernel/net/netfilter/nft_limit.c +++ b/kernel/net/netfilter/nft_limit.c @@ -20,63 +20,79 @@ static DEFINE_SPINLOCK(limit_lock); struct nft_limit { + u64 last; u64 tokens; + u64 tokens_max; u64 rate; - u64 unit; - unsigned long stamp; + u64 nsecs; + u32 burst; }; -static void nft_limit_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) { - struct nft_limit *priv = nft_expr_priv(expr); + u64 now, tokens; + s64 delta; spin_lock_bh(&limit_lock); - if (time_after_eq(jiffies, priv->stamp)) { - priv->tokens = priv->rate; - priv->stamp = jiffies + priv->unit * HZ; - } - - if (priv->tokens >= 1) { - priv->tokens--; + now = ktime_get_ns(); + tokens = limit->tokens + now - limit->last; + if (tokens > limit->tokens_max) + tokens = limit->tokens_max; + + limit->last = now; + delta = tokens - cost; + if (delta >= 0) { + limit->tokens = delta; spin_unlock_bh(&limit_lock); - return; + return false; } + limit->tokens = tokens; spin_unlock_bh(&limit_lock); - - regs->verdict.code = NFT_BREAK; + return true; } -static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { - [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, - [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, -}; - -static int nft_limit_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, +static int nft_limit_init(struct nft_limit *limit, const struct nlattr * const tb[]) { - struct nft_limit *priv = nft_expr_priv(expr); + u64 unit; if (tb[NFTA_LIMIT_RATE] == NULL || tb[NFTA_LIMIT_UNIT] == NULL) return -EINVAL; - priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); - priv->unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); - priv->stamp = jiffies + priv->unit * HZ; - priv->tokens = priv->rate; + limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); + limit->nsecs = unit * NSEC_PER_SEC; + if (limit->rate == 0 || limit->nsecs < unit) + return -EOVERFLOW; + limit->tokens = limit->tokens_max = limit->nsecs; + + if (tb[NFTA_LIMIT_BURST]) { + u64 rate; + + limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); + + rate = limit->rate + limit->burst; + if (rate < limit->rate) + return -EOVERFLOW; + + limit->rate = rate; + } + limit->last = ktime_get_ns(); + return 0; } -static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, + enum nft_limit_type type) { - const struct nft_limit *priv = nft_expr_priv(expr); + u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); + u64 rate = limit->rate - limit->burst; - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate))) - goto nla_put_failure; - if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit))) + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) || + nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) || + nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) || + nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type))) goto nla_put_failure; return 0; @@ -84,18 +100,114 @@ nla_put_failure: return -1; } +struct nft_limit_pkts { + struct nft_limit limit; + u64 cost; +}; + +static void nft_limit_pkts_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit_pkts *priv = nft_expr_priv(expr); + + if (nft_limit_eval(&priv->limit, priv->cost)) + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { + [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, + [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, + [NFTA_LIMIT_BURST] = { .type = NLA_U32 }, + [NFTA_LIMIT_TYPE] = { .type = NLA_U32 }, +}; + +static int nft_limit_pkts_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_limit_pkts *priv = nft_expr_priv(expr); + int err; + + err = nft_limit_init(&priv->limit, tb); + if (err < 0) + return err; + + priv->cost = div_u64(priv->limit.nsecs, priv->limit.rate); + return 0; +} + +static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_limit_pkts *priv = nft_expr_priv(expr); + + return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); +} + static struct nft_expr_type nft_limit_type; -static const struct nft_expr_ops nft_limit_ops = { +static const struct nft_expr_ops nft_limit_pkts_ops = { + .type = &nft_limit_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)), + .eval = nft_limit_pkts_eval, + .init = nft_limit_pkts_init, + .dump = nft_limit_pkts_dump, +}; + +static void nft_limit_pkt_bytes_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit *priv = nft_expr_priv(expr); + u64 cost = div_u64(priv->nsecs * pkt->skb->len, priv->rate); + + if (nft_limit_eval(priv, cost)) + regs->verdict.code = NFT_BREAK; +} + +static int nft_limit_pkt_bytes_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_limit *priv = nft_expr_priv(expr); + + return nft_limit_init(priv, tb); +} + +static int nft_limit_pkt_bytes_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_limit *priv = nft_expr_priv(expr); + + return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); +} + +static const struct nft_expr_ops nft_limit_pkt_bytes_ops = { .type = &nft_limit_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), - .eval = nft_limit_eval, - .init = nft_limit_init, - .dump = nft_limit_dump, + .eval = nft_limit_pkt_bytes_eval, + .init = nft_limit_pkt_bytes_init, + .dump = nft_limit_pkt_bytes_dump, }; +static const struct nft_expr_ops * +nft_limit_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_LIMIT_TYPE] == NULL) + return &nft_limit_pkts_ops; + + switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) { + case NFT_LIMIT_PKTS: + return &nft_limit_pkts_ops; + case NFT_LIMIT_PKT_BYTES: + return &nft_limit_pkt_bytes_ops; + } + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_limit_type __read_mostly = { .name = "limit", - .ops = &nft_limit_ops, + .select_ops = nft_limit_select_ops, .policy = nft_limit_policy, .maxattr = NFTA_LIMIT_MAX, .flags = NFT_EXPR_STATEFUL, diff --git a/kernel/net/netfilter/nft_log.c b/kernel/net/netfilter/nft_log.c index a13d6a386..319c22b4b 100644 --- a/kernel/net/netfilter/nft_log.c +++ b/kernel/net/netfilter/nft_log.c @@ -31,9 +31,8 @@ static void nft_log_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { const struct nft_log *priv = nft_expr_priv(expr); - struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in, + nf_log_packet(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in, pkt->out, &priv->loginfo, "%s", priv->prefix); } diff --git a/kernel/net/netfilter/nft_meta.c b/kernel/net/netfilter/nft_meta.c index 52561e1c3..9dfaf4d55 100644 --- a/kernel/net/netfilter/nft_meta.c +++ b/kernel/net/netfilter/nft_meta.c @@ -31,6 +31,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = pkt->in, *out = pkt->out; + struct sock *sk; u32 *dest = ®s->data[priv->dreg]; switch (priv->key) { @@ -42,7 +43,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, *(__be16 *)dest = skb->protocol; break; case NFT_META_NFPROTO: - *dest = pkt->ops->pf; + *dest = pkt->pf; break; case NFT_META_L4PROTO: *dest = pkt->tprot; @@ -86,33 +87,35 @@ void nft_meta_get_eval(const struct nft_expr *expr, *(u16 *)dest = out->type; break; case NFT_META_SKUID: - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) goto err; - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket == NULL || - skb->sk->sk_socket->file == NULL) { - read_unlock_bh(&skb->sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket == NULL || + sk->sk_socket->file == NULL) { + read_unlock_bh(&sk->sk_callback_lock); goto err; } *dest = from_kuid_munged(&init_user_ns, - skb->sk->sk_socket->file->f_cred->fsuid); - read_unlock_bh(&skb->sk->sk_callback_lock); + sk->sk_socket->file->f_cred->fsuid); + read_unlock_bh(&sk->sk_callback_lock); break; case NFT_META_SKGID: - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) goto err; - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket == NULL || - skb->sk->sk_socket->file == NULL) { - read_unlock_bh(&skb->sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket == NULL || + sk->sk_socket->file == NULL) { + read_unlock_bh(&sk->sk_callback_lock); goto err; } *dest = from_kgid_munged(&init_user_ns, - skb->sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&skb->sk->sk_callback_lock); + sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&sk->sk_callback_lock); break; #ifdef CONFIG_IP_ROUTE_CLASSID case NFT_META_RTCLASSID: { @@ -135,7 +138,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; } - switch (pkt->ops->pf) { + switch (pkt->pf) { case NFPROTO_IPV4: if (ipv4_is_multicast(ip_hdr(skb)->daddr)) *dest = PACKET_MULTICAST; @@ -166,11 +169,14 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; *dest = out->group; break; +#ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) goto err; - *dest = skb->sk->sk_classid; + *dest = sk->sk_classid; break; +#endif default: WARN_ON(1); goto err; @@ -246,7 +252,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_CPU: case NFT_META_IIFGROUP: case NFT_META_OIFGROUP: +#ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: +#endif len = sizeof(u32); break; case NFT_META_IIFNAME: diff --git a/kernel/net/netfilter/nft_payload.c b/kernel/net/netfilter/nft_payload.c index 94fb3b27a..09b4b07eb 100644 --- a/kernel/net/netfilter/nft_payload.c +++ b/kernel/net/netfilter/nft_payload.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -17,6 +18,53 @@ #include #include +/* add vlan header into the user buffer for if tag was removed by offloads */ +static bool +nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) +{ + int mac_off = skb_mac_header(skb) - skb->data; + u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d; + struct vlan_ethhdr veth; + + vlanh = (u8 *) &veth; + if (offset < ETH_HLEN) { + u8 ethlen = min_t(u8, len, ETH_HLEN - offset); + + if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN)) + return false; + + veth.h_vlan_proto = skb->vlan_proto; + + memcpy(dst_u8, vlanh + offset, ethlen); + + len -= ethlen; + if (len == 0) + return true; + + dst_u8 += ethlen; + offset = ETH_HLEN; + } else if (offset >= VLAN_ETH_HLEN) { + offset -= VLAN_HLEN; + goto skip; + } + + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + veth.h_vlan_encapsulated_proto = skb->protocol; + + vlanh += offset; + + vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset); + memcpy(dst_u8, vlanh, vlan_len); + + len -= vlan_len; + if (!len) + return true; + + dst_u8 += vlan_len; + skip: + return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; +} + static void nft_payload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -26,10 +74,18 @@ static void nft_payload_eval(const struct nft_expr *expr, u32 *dest = ®s->data[priv->dreg]; int offset; + dest[priv->len / NFT_REG32_SIZE] = 0; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) goto err; + + if (skb_vlan_tag_present(skb)) { + if (!nft_payload_copy_vlan(dest, skb, + priv->offset, priv->len)) + goto err; + return; + } offset = skb_mac_header(skb) - skb->data; break; case NFT_PAYLOAD_NETWORK_HEADER: @@ -43,7 +99,6 @@ static void nft_payload_eval(const struct nft_expr *expr, } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; if (skb_copy_bits(skb, offset, dest, priv->len) < 0) goto err; return; diff --git a/kernel/net/netfilter/nft_queue.c b/kernel/net/netfilter/nft_queue.c index 96805d21d..61d216eb7 100644 --- a/kernel/net/netfilter/nft_queue.c +++ b/kernel/net/netfilter/nft_queue.c @@ -42,7 +42,7 @@ static void nft_queue_eval(const struct nft_expr *expr, queue = priv->queuenum + cpu % priv->queues_total; } else { queue = nfqueue_hash(pkt->skb, queue, - priv->queues_total, pkt->ops->pf, + priv->queues_total, pkt->pf, jhash_initval); } } diff --git a/kernel/net/netfilter/nft_reject_inet.c b/kernel/net/netfilter/nft_reject_inet.c index 635dbba93..759ca5248 100644 --- a/kernel/net/netfilter/nft_reject_inet.c +++ b/kernel/net/netfilter/nft_reject_inet.c @@ -22,38 +22,37 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); - struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); - switch (pkt->ops->pf) { + switch (pkt->pf) { case NFPROTO_IPV4: switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: nf_send_unreach(pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset(pkt->skb, pkt->ops->hooknum); + nf_send_reset(pkt->net, pkt->skb, pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach(pkt->skb, nft_reject_icmp_code(priv->icmp_code), - pkt->ops->hooknum); + pkt->hook); break; } break; case NFPROTO_IPV6: switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach6(net, pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + nf_send_unreach6(pkt->net, pkt->skb, priv->icmp_code, + pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + nf_send_reset6(pkt->net, pkt->skb, pkt->hook); break; case NFT_REJECT_ICMPX_UNREACH: - nf_send_unreach6(net, pkt->skb, + nf_send_unreach6(pkt->net, pkt->skb, nft_reject_icmpv6_code(priv->icmp_code), - pkt->ops->hooknum); + pkt->hook); break; } break; diff --git a/kernel/net/netfilter/x_tables.c b/kernel/net/netfilter/x_tables.c index 51a459c3c..d4aaad747 100644 --- a/kernel/net/netfilter/x_tables.c +++ b/kernel/net/netfilter/x_tables.c @@ -67,9 +67,6 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = { [NFPROTO_IPV6] = "ip6", }; -/* Allow this many total (re)entries. */ -static const unsigned int xt_jumpstack_multiplier = 2; - /* Registration hooks for targets. */ int xt_register_target(struct xt_target *target) { @@ -658,35 +655,23 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user); struct xt_table_info *xt_alloc_table_info(unsigned int size) { - struct xt_table_info *newinfo; - int cpu; + struct xt_table_info *info = NULL; + size_t sz = sizeof(*info) + size; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; - newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); - if (!newinfo) - return NULL; - - newinfo->size = size; - - for_each_possible_cpu(cpu) { - if (size <= PAGE_SIZE) - newinfo->entries[cpu] = kmalloc_node(size, - GFP_KERNEL, - cpu_to_node(cpu)); - else - newinfo->entries[cpu] = vmalloc_node(size, - cpu_to_node(cpu)); - - if (newinfo->entries[cpu] == NULL) { - xt_free_table_info(newinfo); + if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!info) { + info = vmalloc(sz); + if (!info) return NULL; - } } - - return newinfo; + memset(info, 0, sizeof(*info)); + info->size = size; + return info; } EXPORT_SYMBOL(xt_alloc_table_info); @@ -694,18 +679,13 @@ void xt_free_table_info(struct xt_table_info *info) { int cpu; - for_each_possible_cpu(cpu) - kvfree(info->entries[cpu]); - if (info->jumpstack != NULL) { for_each_possible_cpu(cpu) kvfree(info->jumpstack[cpu]); kvfree(info->jumpstack); } - free_percpu(info->stackptr); - - kfree(info); + kvfree(info); } EXPORT_SYMBOL(xt_free_table_info); @@ -747,15 +727,14 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock); DEFINE_PER_CPU(seqcount_t, xt_recseq); EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); +struct static_key xt_tee_enabled __read_mostly; +EXPORT_SYMBOL_GPL(xt_tee_enabled); + static int xt_jumpstack_alloc(struct xt_table_info *i) { unsigned int size; int cpu; - i->stackptr = alloc_percpu(unsigned int); - if (i->stackptr == NULL) - return -ENOMEM; - size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) i->jumpstack = vzalloc(size); @@ -764,8 +743,21 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) if (i->jumpstack == NULL) return -ENOMEM; - i->stacksize *= xt_jumpstack_multiplier; - size = sizeof(void *) * i->stacksize; + /* ruleset without jumps -- no stack needed */ + if (i->stacksize == 0) + return 0; + + /* Jumpstack needs to be able to record two full callchains, one + * from the first rule set traversal, plus one table reentrancy + * via -j TEE without clobbering the callchain that brought us to + * TEE target. + * + * This is done by allocating two jumpstacks per cpu, on reentry + * the upper half of the stack is used. + * + * see the jumpstack setup in ipt_do_table() for more details. + */ + size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { if (size > PAGE_SIZE) i->jumpstack[cpu] = vmalloc_node(size, @@ -947,11 +939,9 @@ static int xt_table_seq_show(struct seq_file *seq, void *v) { struct xt_table *table = list_entry(v, struct xt_table, list); - if (strlen(table->name)) { + if (*table->name) seq_printf(seq, "%s\n", table->name); - return seq_has_overflowed(seq); - } else - return 0; + return 0; } static const struct seq_operations xt_table_seq_ops = { @@ -1087,10 +1077,8 @@ static int xt_match_seq_show(struct seq_file *seq, void *v) if (trav->curr == trav->head) return 0; match = list_entry(trav->curr, struct xt_match, list); - if (*match->name == '\0') - return 0; - seq_printf(seq, "%s\n", match->name); - return seq_has_overflowed(seq); + if (*match->name) + seq_printf(seq, "%s\n", match->name); } return 0; } @@ -1142,10 +1130,8 @@ static int xt_target_seq_show(struct seq_file *seq, void *v) if (trav->curr == trav->head) return 0; target = list_entry(trav->curr, struct xt_target, list); - if (*target->name == '\0') - return 0; - seq_printf(seq, "%s\n", target->name); - return seq_has_overflowed(seq); + if (*target->name) + seq_printf(seq, "%s\n", target->name); } return 0; } @@ -1207,7 +1193,6 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) if (!(hook_mask & 1)) continue; ops[i].hook = fn; - ops[i].owner = table->me; ops[i].pf = table->af; ops[i].hooknum = hooknum; ops[i].priority = table->priority; diff --git a/kernel/net/netfilter/xt_CT.c b/kernel/net/netfilter/xt_CT.c index 75747aecd..e7ac07e53 100644 --- a/kernel/net/netfilter/xt_CT.c +++ b/kernel/net/netfilter/xt_CT.c @@ -171,6 +171,9 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, if (timeout_ext == NULL) ret = -ENOMEM; + rcu_read_unlock(); + return ret; + err_put_timeout: __xt_ct_tg_timeout_put(timeout); out: @@ -181,10 +184,23 @@ out: #endif } +static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info) +{ + switch (info->flags & (XT_CT_ZONE_DIR_ORIG | + XT_CT_ZONE_DIR_REPL)) { + case XT_CT_ZONE_DIR_ORIG: + return NF_CT_ZONE_DIR_ORIG; + case XT_CT_ZONE_DIR_REPL: + return NF_CT_ZONE_DIR_REPL; + default: + return NF_CT_DEFAULT_ZONE_DIR; + } +} + static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { - struct nf_conntrack_tuple t; + struct nf_conntrack_zone zone; struct nf_conn *ct; int ret = -EOPNOTSUPP; @@ -194,7 +210,9 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, } #ifndef CONFIG_NF_CONNTRACK_ZONES - if (info->zone) + if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG | + XT_CT_ZONE_DIR_REPL | + XT_CT_ZONE_MARK)) goto err1; #endif @@ -202,11 +220,17 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (ret < 0) goto err1; - memset(&t, 0, sizeof(t)); - ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL); - ret = PTR_ERR(ct); - if (IS_ERR(ct)) + memset(&zone, 0, sizeof(zone)); + zone.id = info->zone; + zone.dir = xt_ct_flags_to_dir(info); + if (info->flags & XT_CT_ZONE_MARK) + zone.flags |= NF_CT_FLAG_MARK; + + ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL); + if (!ct) { + ret = -ENOMEM; goto err2; + } ret = 0; if ((info->ct_events || info->exp_events) && @@ -227,14 +251,14 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (ret < 0) goto err3; } - - nf_conntrack_tmpl_insert(par->net, ct); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_get(&ct->ct_general); out: info->ct = ct; return 0; err3: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err2: nf_ct_l3proto_module_put(par->family); err1: @@ -297,8 +321,10 @@ static void xt_ct_destroy_timeout(struct nf_conn *ct) if (timeout_put) { timeout_ext = nf_ct_timeout_find(ct); - if (timeout_ext) + if (timeout_ext) { timeout_put(timeout_ext->timeout); + RCU_INIT_POINTER(timeout_ext->timeout, NULL); + } } rcu_read_unlock(); #endif diff --git a/kernel/net/netfilter/xt_IDLETIMER.c b/kernel/net/netfilter/xt_IDLETIMER.c index f407ebc13..29d2c31f4 100644 --- a/kernel/net/netfilter/xt_IDLETIMER.c +++ b/kernel/net/netfilter/xt_IDLETIMER.c @@ -126,6 +126,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) goto out; } + sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; diff --git a/kernel/net/netfilter/xt_LOG.c b/kernel/net/netfilter/xt_LOG.c index c13b79440..1763ab82b 100644 --- a/kernel/net/netfilter/xt_LOG.c +++ b/kernel/net/netfilter/xt_LOG.c @@ -33,7 +33,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_log_info *loginfo = par->targinfo; struct nf_loginfo li; - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; li.type = NF_LOG_TYPE_LOG; li.u.log.level = loginfo->level; diff --git a/kernel/net/netfilter/xt_NFLOG.c b/kernel/net/netfilter/xt_NFLOG.c index fb7497c92..a1fa2c800 100644 --- a/kernel/net/netfilter/xt_NFLOG.c +++ b/kernel/net/netfilter/xt_NFLOG.c @@ -26,7 +26,7 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_nflog_info *info = par->targinfo; struct nf_loginfo li; - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; diff --git a/kernel/net/netfilter/xt_TCPMSS.c b/kernel/net/netfilter/xt_TCPMSS.c index e762de5ee..b7c43def0 100644 --- a/kernel/net/netfilter/xt_TCPMSS.c +++ b/kernel/net/netfilter/xt_TCPMSS.c @@ -108,7 +108,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); if (dst_mtu(skb_dst(skb)) <= minlen) { @@ -144,7 +144,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, inet_proto_csum_replace2(&tcph->check, skb, htons(oldmss), htons(newmss), - 0); + false); return 0; } } @@ -185,18 +185,18 @@ tcpmss_mangle_packet(struct sk_buff *skb, memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, - htons(len), htons(len + TCPOLEN_MSS), 1); + htons(len), htons(len + TCPOLEN_MSS), true); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; opt[3] = newmss & 0x00ff; - inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0); + inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false); oldval = ((__be16 *)tcph)[6]; tcph->doff += TCPOLEN_MSS/4; inet_proto_csum_replace2(&tcph->check, skb, - oldval, ((__be16 *)tcph)[6], 0); + oldval, ((__be16 *)tcph)[6], false); return TCPOLEN_MSS; } @@ -277,6 +277,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) "FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } + if (par->nft_compat) + return 0; + xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; @@ -299,6 +302,9 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par) "FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } + if (par->nft_compat) + return 0; + xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; diff --git a/kernel/net/netfilter/xt_TCPOPTSTRIP.c b/kernel/net/netfilter/xt_TCPOPTSTRIP.c index 625fa1d63..eb92bffff 100644 --- a/kernel/net/netfilter/xt_TCPOPTSTRIP.c +++ b/kernel/net/netfilter/xt_TCPOPTSTRIP.c @@ -80,7 +80,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, n <<= 8; } inet_proto_csum_replace2(&tcph->check, skb, htons(o), - htons(n), 0); + htons(n), false); } memset(opt + i, TCPOPT_NOP, optl); } diff --git a/kernel/net/netfilter/xt_TEE.c b/kernel/net/netfilter/xt_TEE.c index 292934d23..3eff7b67c 100644 --- a/kernel/net/netfilter/xt_TEE.c +++ b/kernel/net/netfilter/xt_TEE.c @@ -10,26 +10,15 @@ * modify it under the terms of the GNU General Public License * version 2 or later, as published by the Free Software Foundation. */ -#include #include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include #include -#if IS_ENABLED(CONFIG_NF_CONNTRACK) -# define WITH_CONNTRACK 1 -# include -#endif - struct xt_tee_priv { struct notifier_block notifier; struct xt_tee_tginfo *tginfo; @@ -37,162 +26,27 @@ struct xt_tee_priv { }; static const union nf_inet_addr tee_zero_address; -static DEFINE_PER_CPU(bool, tee_active); - -static struct net *pick_net(struct sk_buff *skb) -{ -#ifdef CONFIG_NET_NS - const struct dst_entry *dst; - - if (skb->dev != NULL) - return dev_net(skb->dev); - dst = skb_dst(skb); - if (dst != NULL && dst->dev != NULL) - return dev_net(dst->dev); -#endif - return &init_net; -} - -static bool -tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info) -{ - const struct iphdr *iph = ip_hdr(skb); - struct net *net = pick_net(skb); - struct rtable *rt; - struct flowi4 fl4; - - memset(&fl4, 0, sizeof(fl4)); - if (info->priv) { - if (info->priv->oif == -1) - return false; - fl4.flowi4_oif = info->priv->oif; - } - fl4.daddr = info->gw.ip; - fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_scope = RT_SCOPE_UNIVERSE; - fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return false; - - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - skb->dev = rt->dst.dev; - skb->protocol = htons(ETH_P_IP); - return true; -} static unsigned int tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; - struct iphdr *iph; + int oif = info->priv ? info->priv->oif : 0; - if (__this_cpu_read(tee_active)) - return XT_CONTINUE; - /* - * Copy the skb, and route the copy. Will later return %XT_CONTINUE for - * the original skb, which should continue on its way as if nothing has - * happened. The copy should be independently delivered to the TEE - * --gateway. - */ - skb = pskb_copy(skb, GFP_ATOMIC); - if (skb == NULL) - return XT_CONTINUE; - -#ifdef WITH_CONNTRACK - /* Avoid counting cloned packets towards the original connection. */ - nf_conntrack_put(skb->nfct); - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); -#endif - /* - * If we are in PREROUTING/INPUT, the checksum must be recalculated - * since the length could have changed as a result of defragmentation. - * - * We also decrease the TTL to mitigate potential TEE loops - * between two hosts. - * - * Set %IP_DF so that the original source is notified of a potentially - * decreased MTU on the clone route. IPv6 does this too. - */ - iph = ip_hdr(skb); - iph->frag_off |= htons(IP_DF); - if (par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_LOCAL_IN) - --iph->ttl; - ip_send_check(iph); + nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, oif); - if (tee_tg_route4(skb, info)) { - __this_cpu_write(tee_active, true); - ip_local_out(skb); - __this_cpu_write(tee_active, false); - } else { - kfree_skb(skb); - } return XT_CONTINUE; } -#if IS_ENABLED(CONFIG_IPV6) -static bool -tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info) -{ - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct net *net = pick_net(skb); - struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - if (info->priv) { - if (info->priv->oif == -1) - return false; - fl6.flowi6_oif = info->priv->oif; - } - fl6.daddr = info->gw.in6; - fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | - (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - dst_release(dst); - return false; - } - skb_dst_drop(skb); - skb_dst_set(skb, dst); - skb->dev = dst->dev; - skb->protocol = htons(ETH_P_IPV6); - return true; -} - +#if IS_ENABLED(CONFIG_NF_DUP_IPV6) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; + int oif = info->priv ? info->priv->oif : 0; - if (__this_cpu_read(tee_active)) - return XT_CONTINUE; - skb = pskb_copy(skb, GFP_ATOMIC); - if (skb == NULL) - return XT_CONTINUE; + nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, oif); -#ifdef WITH_CONNTRACK - nf_conntrack_put(skb->nfct); - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); -#endif - if (par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_LOCAL_IN) { - struct ipv6hdr *iph = ipv6_hdr(skb); - --iph->hop_limit; - } - if (tee_tg_route6(skb, info)) { - __this_cpu_write(tee_active, true); - ip6_local_out(skb); - __this_cpu_write(tee_active, false); - } else { - kfree_skb(skb); - } return XT_CONTINUE; } #endif @@ -251,6 +105,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par) } else info->priv = NULL; + static_key_slow_inc(&xt_tee_enabled); return 0; } @@ -262,6 +117,7 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par) unregister_netdevice_notifier(&info->priv->notifier); kfree(info->priv); } + static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { @@ -275,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .destroy = tee_tg_destroy, .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_NF_DUP_IPV6) { .name = "TEE", .revision = 1, diff --git a/kernel/net/netfilter/xt_TPROXY.c b/kernel/net/netfilter/xt_TPROXY.c index cca96cec1..3ab591e73 100644 --- a/kernel/net/netfilter/xt_TPROXY.c +++ b/kernel/net/netfilter/xt_TPROXY.c @@ -250,8 +250,8 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, * no such listener is found, or NULL if the TCP header is incomplete. */ static struct sock * -tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, - struct sock *sk) +tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, + __be32 laddr, __be16 lport, struct sock *sk) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr _hdr, *hp; @@ -267,13 +267,12 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } @@ -291,7 +290,7 @@ nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) } static unsigned int -tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, +tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) { const struct iphdr *iph = ip_hdr(skb); @@ -306,7 +305,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + sk = nf_tproxy_get_sock_v4(net, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NFT_LOOKUP_ESTABLISHED); @@ -318,11 +317,11 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) /* reopening a TIME_WAIT connection needs special handling */ - sk = tproxy_handle_time_wait4(skb, laddr, lport, sk); + sk = tproxy_handle_time_wait4(net, skb, laddr, lport, sk); else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + sk = nf_tproxy_get_sock_v4(net, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NFT_LOOKUP_LISTENER); @@ -352,7 +351,7 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info *tgi = par->targinfo; - return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); + return tproxy_tg4(par->net, skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); } static unsigned int @@ -360,7 +359,7 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; - return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); + return tproxy_tg4(par->net, skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); } #ifdef XT_TPROXY_HAVE_IPV6 @@ -430,15 +429,14 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk2 = nf_tproxy_get_sock_v6(par->net, tproto, &iph->saddr, tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), hp->source, tgi->lport ? tgi->lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } @@ -474,7 +472,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk = nf_tproxy_get_sock_v6(par->net, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, par->in, NFT_LOOKUP_ESTABLISHED); @@ -489,7 +487,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk = nf_tproxy_get_sock_v6(par->net, tproto, &iph->saddr, laddr, hp->source, lport, par->in, NFT_LOOKUP_LISTENER); diff --git a/kernel/net/netfilter/xt_addrtype.c b/kernel/net/netfilter/xt_addrtype.c index fab6eea1b..11d609199 100644 --- a/kernel/net/netfilter/xt_addrtype.c +++ b/kernel/net/netfilter/xt_addrtype.c @@ -73,7 +73,7 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) ret |= XT_ADDRTYPE_LOCAL; - if (rt->rt6i_flags & RTF_ANYCAST) + if (ipv6_anycast_destination((struct dst_entry *)rt, addr)) ret |= XT_ADDRTYPE_ANYCAST; dst_release(&rt->dst); @@ -125,7 +125,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev, static bool addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; const struct xt_addrtype_info *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); bool ret = true; @@ -143,7 +143,7 @@ addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) static bool addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; const struct xt_addrtype_info_v1 *info = par->matchinfo; const struct iphdr *iph; const struct net_device *dev = NULL; diff --git a/kernel/net/netfilter/xt_connlabel.c b/kernel/net/netfilter/xt_connlabel.c index 9f8719df2..bb9cbeb18 100644 --- a/kernel/net/netfilter/xt_connlabel.c +++ b/kernel/net/netfilter/xt_connlabel.c @@ -42,10 +42,6 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) XT_CONNLABEL_OP_SET; struct xt_connlabel_mtinfo *info = par->matchinfo; int ret; - size_t words; - - if (info->bit > XT_CONNLABEL_MAXBIT) - return -ERANGE; if (info->options & ~options) { pr_err("Unknown options in mask %x\n", info->options); @@ -59,19 +55,15 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) return ret; } - par->net->ct.labels_used++; - words = BITS_TO_LONGS(info->bit+1); - if (words > par->net->ct.label_words) - par->net->ct.label_words = words; - + ret = nf_connlabels_get(par->net, info->bit + 1); + if (ret < 0) + nf_ct_l3proto_module_put(par->family); return ret; } static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) { - par->net->ct.labels_used--; - if (par->net->ct.labels_used == 0) - par->net->ct.label_words = 0; + nf_connlabels_put(par->net); nf_ct_l3proto_module_put(par->family); } diff --git a/kernel/net/netfilter/xt_connlimit.c b/kernel/net/netfilter/xt_connlimit.c index 29ba6218a..99bbc8298 100644 --- a/kernel/net/netfilter/xt_connlimit.c +++ b/kernel/net/netfilter/xt_connlimit.c @@ -134,7 +134,7 @@ static bool add_hlist(struct hlist_head *head, static unsigned int check_hlist(struct net *net, struct hlist_head *head, const struct nf_conntrack_tuple *tuple, - u16 zone, + const struct nf_conntrack_zone *zone, bool *addit) { const struct nf_conntrack_tuple_hash *found; @@ -201,7 +201,7 @@ static unsigned int count_tree(struct net *net, struct rb_root *root, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, const union nf_inet_addr *mask, - u8 family, u16 zone) + u8 family, const struct nf_conntrack_zone *zone) { struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; @@ -290,7 +290,8 @@ static int count_them(struct net *net, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, const union nf_inet_addr *mask, - u_int8_t family, u16 zone) + u_int8_t family, + const struct nf_conntrack_zone *zone) { struct rb_root *root; int count; @@ -316,22 +317,22 @@ static int count_them(struct net *net, static bool connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; const struct xt_connlimit_info *info = par->matchinfo; union nf_inet_addr addr; struct nf_conntrack_tuple tuple; const struct nf_conntrack_tuple *tuple_ptr = &tuple; + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int connections; - u16 zone = NF_CT_DEFAULT_ZONE; ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) { tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; zone = nf_ct_zone(ct); } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - par->family, &tuple)) { + par->family, net, &tuple)) { goto hotdrop; } diff --git a/kernel/net/netfilter/xt_ipvs.c b/kernel/net/netfilter/xt_ipvs.c index 8d47c3780..71a9d95e0 100644 --- a/kernel/net/netfilter/xt_ipvs.c +++ b/kernel/net/netfilter/xt_ipvs.c @@ -48,6 +48,7 @@ static bool ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_ipvs_mtinfo *data = par->matchinfo; + struct netns_ipvs *ipvs = net_ipvs(par->net); /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */ const u_int8_t family = par->family; struct ip_vs_iphdr iph; @@ -67,7 +68,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) goto out; } - ip_vs_fill_iph_skb(family, skb, &iph); + ip_vs_fill_iph_skb(family, skb, true, &iph); if (data->bitmask & XT_IPVS_PROTO) if ((iph.protocol == data->l4proto) ^ @@ -85,7 +86,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(family, skb, &iph, 1 /* inverse */); + cp = pp->conn_out_get(ipvs, family, skb, &iph); if (unlikely(cp == NULL)) { match = false; goto out; diff --git a/kernel/net/netfilter/xt_mark.c b/kernel/net/netfilter/xt_mark.c index 233452387..ebd41dc50 100644 --- a/kernel/net/netfilter/xt_mark.c +++ b/kernel/net/netfilter/xt_mark.c @@ -23,6 +23,7 @@ MODULE_ALIAS("ipt_mark"); MODULE_ALIAS("ip6t_mark"); MODULE_ALIAS("ipt_MARK"); MODULE_ALIAS("ip6t_MARK"); +MODULE_ALIAS("arpt_MARK"); static unsigned int mark_tg(struct sk_buff *skb, const struct xt_action_param *par) diff --git a/kernel/net/netfilter/xt_nfacct.c b/kernel/net/netfilter/xt_nfacct.c index 8c646ed9c..3048a7e3a 100644 --- a/kernel/net/netfilter/xt_nfacct.c +++ b/kernel/net/netfilter/xt_nfacct.c @@ -37,7 +37,7 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par) struct xt_nfacct_match_info *info = par->matchinfo; struct nf_acct *nfacct; - nfacct = nfnl_acct_find_get(info->name); + nfacct = nfnl_acct_find_get(par->net, info->name); if (nfacct == NULL) { pr_info("xt_nfacct: accounting object with name `%s' " "does not exists\n", info->name); diff --git a/kernel/net/netfilter/xt_osf.c b/kernel/net/netfilter/xt_osf.c index 0778855ea..df8801e02 100644 --- a/kernel/net/netfilter/xt_osf.c +++ b/kernel/net/netfilter/xt_osf.c @@ -200,7 +200,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) unsigned char opts[MAX_IPOPTLEN]; const struct xt_osf_finger *kf; const struct xt_osf_user_finger *f; - struct net *net = dev_net(p->in ? p->in : p->out); + struct net *net = p->net; if (!info) return false; diff --git a/kernel/net/netfilter/xt_owner.c b/kernel/net/netfilter/xt_owner.c index ca2e577ed..1302b475a 100644 --- a/kernel/net/netfilter/xt_owner.c +++ b/kernel/net/netfilter/xt_owner.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -33,8 +34,9 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_owner_match_info *info = par->matchinfo; const struct file *filp; + struct sock *sk = skb_to_full_sk(skb); - if (skb->sk == NULL || skb->sk->sk_socket == NULL) + if (sk == NULL || sk->sk_socket == NULL) return (info->match ^ info->invert) == 0; else if (info->match & info->invert & XT_OWNER_SOCKET) /* @@ -43,7 +45,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) */ return false; - filp = skb->sk->sk_socket->file; + filp = sk->sk_socket->file; if (filp == NULL) return ((info->match ^ info->invert) & (XT_OWNER_UID | XT_OWNER_GID)) == 0; diff --git a/kernel/net/netfilter/xt_recent.c b/kernel/net/netfilter/xt_recent.c index 45e1b30e4..d725a2774 100644 --- a/kernel/net/netfilter/xt_recent.c +++ b/kernel/net/netfilter/xt_recent.c @@ -237,7 +237,7 @@ static void recent_table_flush(struct recent_table *t) static bool recent_mt(const struct sk_buff *skb, struct xt_action_param *par) { - struct net *net = dev_net(par->in ? par->in : par->out); + struct net *net = par->net; struct recent_net *recent_net = recent_pernet(net); const struct xt_recent_mtinfo_v1 *info = par->matchinfo; struct recent_table *t; diff --git a/kernel/net/netfilter/xt_set.c b/kernel/net/netfilter/xt_set.c index 89045982e..5669e5b45 100644 --- a/kernel/net/netfilter/xt_set.c +++ b/kernel/net/netfilter/xt_set.c @@ -9,14 +9,16 @@ */ /* Kernel module which implements the set match and SET target - * for netfilter/iptables. */ + * for netfilter/iptables. + */ #include #include #include -#include +#include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -52,6 +54,7 @@ static bool set_match_v0(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v0 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.u.compat.dim, info->match_set.u.compat.flags, 0, UINT_MAX); @@ -68,10 +71,10 @@ compat_flags(struct xt_set_info_v0 *info) info->u.compat.dim = IPSET_DIM_ZERO; if (info->u.flags[0] & IPSET_MATCH_INV) info->u.compat.flags |= IPSET_INV_MATCH; - for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) { + for (i = 0; i < IPSET_DIM_MAX - 1 && info->u.flags[i]; i++) { info->u.compat.dim++; if (info->u.flags[i] & IPSET_SRC) - info->u.compat.flags |= (1<u.compat.dim); + info->u.compat.flags |= (1 << info->u.compat.dim); } } @@ -88,7 +91,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par) info->match_set.index); return -ENOENT; } - if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { + if (info->match_set.u.flags[IPSET_DIM_MAX - 1] != 0) { pr_warn("Protocol error: set match dimension is over the limit!\n"); ip_set_nfnl_put(par->net, info->match_set.index); return -ERANGE; @@ -114,6 +117,7 @@ static bool set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v1 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, 0, UINT_MAX); @@ -178,9 +182,10 @@ static bool set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v3 *info = par->matchinfo; + int ret; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, info->flags, UINT_MAX); - int ret; if (info->packets.op != IPSET_COUNTER_NONE || info->bytes.op != IPSET_COUNTER_NONE) @@ -224,9 +229,10 @@ static bool set_match_v4(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v4 *info = par->matchinfo; + int ret; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, info->flags, UINT_MAX); - int ret; if (info->packets.op != IPSET_COUNTER_NONE || info->bytes.op != IPSET_COUNTER_NONE) @@ -252,6 +258,7 @@ static unsigned int set_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v0 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim, info->add_set.u.compat.flags, 0, UINT_MAX); ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim, @@ -290,8 +297,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) return -ENOENT; } } - if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 || - info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) { + if (info->add_set.u.flags[IPSET_DIM_MAX - 1] != 0 || + info->del_set.u.flags[IPSET_DIM_MAX - 1] != 0) { pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); @@ -324,6 +331,7 @@ static unsigned int set_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v1 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, 0, UINT_MAX); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -392,6 +400,7 @@ static unsigned int set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v2 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -399,8 +408,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -418,6 +427,8 @@ static unsigned int set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v3 *info = par->targinfo; + int ret; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -425,12 +436,10 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) ADT_OPT(map_opt, par->family, info->map_set.dim, info->map_set.flags, 0, UINT_MAX); - int ret; - /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -456,7 +465,6 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } - static int set_target_v3_checkentry(const struct xt_tgchk_param *par) { @@ -496,8 +504,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) !(par->hook_mask & (1 << NF_INET_FORWARD | 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { - pr_warn("mapping of prio or/and queue is allowed only" - "from OUTPUT/FORWARD/POSTROUTING chains\n"); + pr_warn("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); return -EINVAL; } index = ip_set_nfnl_get_byindex(par->net, @@ -518,8 +525,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) if (info->add_set.dim > IPSET_DIM_MAX || info->del_set.dim > IPSET_DIM_MAX || info->map_set.dim > IPSET_DIM_MAX) { - pr_warn("Protocol error: SET target dimension " - "is over the limit!\n"); + pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) @@ -545,7 +551,6 @@ set_target_v3_destroy(const struct xt_tgdtor_param *par) ip_set_nfnl_put(par->net, info->map_set.index); } - static struct xt_match set_matches[] __read_mostly = { { .name = "set", diff --git a/kernel/net/netfilter/xt_socket.c b/kernel/net/netfilter/xt_socket.c index e092cb046..2ec08f04b 100644 --- a/kernel/net/netfilter/xt_socket.c +++ b/kernel/net/netfilter/xt_socket.c @@ -143,7 +143,8 @@ static bool xt_socket_sk_is_transparent(struct sock *sk) } } -static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb, +static struct sock *xt_socket_lookup_slow_v4(struct net *net, + const struct sk_buff *skb, const struct net_device *indev) { const struct iphdr *iph = ip_hdr(skb); @@ -197,7 +198,7 @@ static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb, } #endif - return xt_socket_get_sock_v4(dev_net(skb->dev), protocol, saddr, daddr, + return xt_socket_get_sock_v4(net, protocol, saddr, daddr, sport, dport, indev); } @@ -205,10 +206,11 @@ static bool socket_match(const struct sk_buff *skb, struct xt_action_param *par, const struct xt_socket_mtinfo1 *info) { + struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; if (!sk) - sk = xt_socket_lookup_slow_v4(skb, par->in); + sk = xt_socket_lookup_slow_v4(par->net, skb, par->in); if (sk) { bool wildcard; bool transparent = true; @@ -226,6 +228,10 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, if (info->flags & XT_SOCKET_TRANSPARENT) transparent = xt_socket_sk_is_transparent(sk); + if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && + transparent) + pskb->mark = sk->sk_mark; + if (sk != skb->sk) sock_gen_put(sk); @@ -247,7 +253,7 @@ socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) } static bool -socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt4_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) { return socket_match(skb, par, par->matchinfo); } @@ -330,7 +336,8 @@ xt_socket_get_sock_v6(struct net *net, const u8 protocol, return NULL; } -static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb, +static struct sock *xt_socket_lookup_slow_v6(struct net *net, + const struct sk_buff *skb, const struct net_device *indev) { __be16 uninitialized_var(dport), uninitialized_var(sport); @@ -366,18 +373,19 @@ static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb, return NULL; } - return xt_socket_get_sock_v6(dev_net(skb->dev), tproto, saddr, daddr, + return xt_socket_get_sock_v6(net, tproto, saddr, daddr, sport, dport, indev); } static bool -socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; if (!sk) - sk = xt_socket_lookup_slow_v6(skb, par->in); + sk = xt_socket_lookup_slow_v6(par->net, skb, par->in); if (sk) { bool wildcard; bool transparent = true; @@ -395,6 +403,10 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) if (info->flags & XT_SOCKET_TRANSPARENT) transparent = xt_socket_sk_is_transparent(sk); + if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && + transparent) + pskb->mark = sk->sk_mark; + if (sk != skb->sk) sock_gen_put(sk); @@ -428,6 +440,19 @@ static int socket_mt_v2_check(const struct xt_mtchk_param *par) return 0; } +static int socket_mt_v3_check(const struct xt_mtchk_param *par) +{ + const struct xt_socket_mtinfo3 *info = + (struct xt_socket_mtinfo3 *)par->matchinfo; + + if (info->flags & ~XT_SOCKET_FLAGS_V3) { + pr_info("unknown flags 0x%x\n", + info->flags & ~XT_SOCKET_FLAGS_V3); + return -EINVAL; + } + return 0; +} + static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", @@ -442,7 +467,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV4, - .match = socket_mt4_v1_v2, + .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -454,7 +479,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV6, - .match = socket_mt6_v1_v2, + .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -466,7 +491,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 2, .family = NFPROTO_IPV4, - .match = socket_mt4_v1_v2, + .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v2_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -478,13 +503,37 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 2, .family = NFPROTO_IPV6, - .match = socket_mt6_v1_v2, + .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v2_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, +#endif + { + .name = "socket", + .revision = 3, + .family = NFPROTO_IPV4, + .match = socket_mt4_v1_v2_v3, + .checkentry = socket_mt_v3_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#ifdef XT_SOCKET_HAVE_IPV6 + { + .name = "socket", + .revision = 3, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1_v2_v3, + .checkentry = socket_mt_v3_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, #endif }; -- cgit 1.2.3-korg